I tell you what I want to do. I am doing a scraping of tematika.com site to be able to create a base of books to me. The problem is that I need to go from page to page but I can not, so I always get the same books. I think with the code they will understand it better.
const phantom = require("phantom");
const cheerio = require("cheerio");
let _instance, _page;
let _books = [];
phantom.create().then(instance => {
_instance = instance;
return _instance.createPage();
}).then(page => {
_page = page;
return _page.open('http://www.tematika.com/catalogo/libros/arte__arquitectura_y_diseno--9.htm');
}).then(status => {
console.log(status);
return _page.property('content')
}).then(html => {
// I transform the html into an object to manipulate it with cheerio
let $ = cheerio.load(html);
//I get all the books and pass them to an array to go through them and extract the info I'm looking for
let books = $('.Gcentro').find('.moduleproductob').toArray();
//As always there are 10 pages I make a for it it itere 10 times
for(let i=0;i<10;i++){
books.forEach(book => {
let _book = {};
_book.imageMin = $(book).find('.Gimagesproductos').attr('src');
_book.linkBook = $(book).find('.FProductos').attr('href');
_books.push(_book);
});
//In this part what I look for is to click on the next button on the page and it is what is not working for me
//What ends up happening is that it never goes back to the page. So always bring me the info of the same 10 books.
_page.evaluate(function() {
var evObj = document.createEvent('Events');
evObj.initEvent('click', true, false);
document.getElementById('catalogNext').dispatchEvent(evObj);
});
}
console.log(_books);
_page.close();
_instance.exit();
}).catch(e => console.log(e));
What I need is to be able to click on next and be able to bring me the books that are showing. If it is not with phantom-node and cheerio can be with another tool, all I need is that it works on node.
Thank you! :)
Related
I have a working puppeteer script that I'd like to make into an API but I'm having problems with waitForSelector.
Background:
I wrote a puppeteer script that successfully searches for and scrapes the result of a query I specify in the code e.g. let address = xyz;. Now I'd like to make it into an API so that a user can query something. I managed to code everything necessary for the local API (working with express) and everything works as well. By that I mean: I coded all the server side stuff: I can make a request, the scraper function is called, puppeteer starts up, carries out my search (I need to type in an address, choose from a dropdown and press enter).
The status:
The result of my query is a form (basically 3 columns and some rows) in an iFrame and I want to scrape all the rows (I modify them into a specific json later on). The way it works is I use waitForSelector on the form's selector and then I use frame.evaluate.
Problem:
When I run my normal scraper everything works well, but when I run the (slightly modified but essentially same) code within the API framework, waitForSelector suddenly always times out. I have tried all the usual workarounds: waitForNavigation, taking a screenshot and inspecting etc but nothing helped. I've been reading quite a bit and could it be that I'm screwing something up in terms of async/await when I call my scraper from within the context of the API? I'm still quite new to this so please bear with me. This is the code of the working script - I indicated the important part
const puppeteer = require("puppeteer");
const chalk = require("chalk");
const fs = require('fs');
const error = chalk.bold.red;
const success = chalk.keyword("green");
address = 'Gumpendorfer Straße 12, 1060 Wien';
(async () => {
try {
// open the headless browser
var browser = await puppeteer.launch();
// open a new page
var page = await browser.newPage();
// enter url in page
await page.goto(`https://mein.wien.gv.at/Meine-Amtswege/richtwert?subpage=/lagezuschlag/`, {waitUntil: 'networkidle2'});
// continue without newsletter
await page.click('#dss-modal-firstvisit-form > button.btn.btn-block.btn-light');
// let everyhting load
await page.waitFor(1000)
console.log('waiting for iframe with form to be ready.');
//wait until selector is available
await page.waitForSelector('iframe');
console.log('iframe is ready. Loading iframe content');
//choose the relevant iframe
const elementHandle = await page.$(
'iframe[src="/richtwertfrontend/lagezuschlag/"]',
);
//go into frame in order to input info
const frame = await elementHandle.contentFrame();
//enter address
console.log('filling form in iframe');
await frame.type('#input_adresse', address, { delay: 100});
//choose first option from dropdown
console.log('Choosing from dropdown');
await frame.click('#react-autowhatever-1--item-0');
console.log('pressing button');
//press button to search
await frame.click('#next-button');
// scraping data
console.log('scraping')
await frame.waitForSelector('#summary > div > div > br ~ div');//This keeps failing in the API
const res = await frame.evaluate(() => {
const rows = [...document.querySelectorAll('#summary > div > div > br ~ div')];
const cells = rows.map(
row => [...row.querySelectorAll('div')]
.map(cell => cell.innerText)
);
return cells;
});
await browser.close();
console.log(success("Browser Closed"));
const mapFields = (arr1, arr2) => {
const mappedArray = arr2.map((el) => {
const mappedArrayEl = {};
el.forEach((value, i) => {
if (arr1.length < (i+1)) return;
mappedArrayEl[arr1[i]] = value;
});
return mappedArrayEl;
});
return mappedArray;
}
const Arr1 = res[0];
const Arr2 = res.slice(1,3);
let dataObj = {};
dataObj[address] = [];
// dataObj['lagezuschlag'] = mapFields(Arr1, Arr2);
// dataObj['adresse'] = address;
dataObj[address] = mapFields(Arr1, Arr2);
console.log(dataObj);
} catch (err) {
// Catch and display errors
console.log(error(err));
await browser.close();
console.log(error("Browser Closed"));
}
})();
I just can't understand why it would work in the one case and not in the other, even though I barely changed something. For the API I basically changed the name of the async function to const search = async (address) => { such that I can call it with the query in my server side script.
Thanks in advance - I'm not attaching the API code cause I don't want to clutter the question. I can update it if it's necessary
I solved this myself. Turns out the problem wasn't as complicated as I thought and it was annoyingly simple to solve. The problem wasn't with the selector that was timing out but with the previous selectors, specifically the typing and choosing from dropdown selectors. Essentially, things were going too fast. Before the search query was typed in, the dropdown was already pressed and nonsense came out. How I solved it: I included a waitFor(1000) call before the dropdown is selected and everything went perfectly. An interesting realisation was that even though that one selector timed out, it wasn't actually the source of the problem. But like I said, annoyingly simple and I feel dumb for asking this :) but maybe someone will see this and learn from my mistake
I am currently evaluating WebViewer version 5.2.8.
I need to set some javascript function/code as an action for triggers like calculate trigger, format trigger and keystroke trigger through the WebViewer UI.
Please help me on how to configure javascript code for a form field trigger in WebViewer UI.
Thanks in advance,
Syed
Sorry for the late response!
You will have to create the UI components yourself that will take in the JavaScript code. You can do something similar to what the FormBuilder demo does with just HTML and JavaScript. However, it may be better to clone the open source UI and add your own components.
As for setting the action, I would recommend trying out version 6.0 instead as there is better support for widgets and form fields in that version. However, we are investigating a bug with the field actions that will throw an error on downloading the document. You should be able to use this code to get it working first:
docViewer.on('annotationsLoaded', () => {
const annotations = annotManager.getAnnotationsList();
annotations.forEach(annot => {
const action = new instance.Actions.JavaScript({ javascript: 'alert("Hello World!")' });
// C cor Calculate, and F for Format
annot.addAction('K', action);
});
});
Once the bug has been dealt with, you should be able to download the document properly.
Otherwise, you will have to use the full API and that may be less than ideal. It would be a bit more complicated with the full API and I would not recommend it if the above feature will be fixed soon.
Let me know if this helps or if you need more information about using the full API to accomplish this!
EDIT
Here is the code to do it with the full API! Since the full API works at a low level and very closely to the PDF specification, it does take a lot more to make it work. You do still have to update the annotations with the code I provided before which I will include again.
docViewer.on('documentLoaded', async () => {
// This part requires the full API: https://www.pdftron.com/documentation/web/guides/full-api/setup/
const doc = docViewer.getDocument();
// Get document from worker
const pdfDoc = await doc.getPDFDoc();
const pageItr = await pdfDoc.getPageIterator();
while (await pageItr.hasNext()) {
const page = await pageItr.current();
// Note: this is a PDF array, not a JS array
const annots = await page.getAnnots();
const numAnnots = await page.getNumAnnots();
for (let i = 0; i < numAnnots; i++) {
const annot = await annots.getAt(i);
const subtypeDict = await annot.findObj('Subtype');
const subtype = await subtypeDict.getName();
const actions = await annot.findObj('AA');
// Check to make sure the annot is of type Widget
if (subtype === 'Widget') {
// Create the additional actions dictionary if it does not exist
if (!actions) {
actions = await annot.putDict('AA');
}
let calculate = await actions.findObj('C');
// Create the calculate action (C) if it does not exist
if (!calculate) {
calculate = await actions.putDict('C');
await Promise.all([calculate.putName('S', 'JavaScript'), calculate.putString('JS', 'app.alert("Hello World!")')]);
}
// Repeat for keystroke (K) and format (F)
}
}
pageItr.next();
}
});
docViewer.on('annotationsLoaded', () => {
const annotations = annotManager.getAnnotationsList();
annotations.forEach(annot => {
const action = new instance.Actions.JavaScript({ javascript: 'app.alert("Hello World!")' });
// K for Keystroke, and F for Format
annot.addAction('C', action);
});
});
You can probably put them together under the documentLoaded event but once the fix is ready, you can delete the part using the full API.
Node-red node for integrating with an older ventilation system, using screen scraping, nodejs with cheerio. Works fine for fetching some values now, but I seem unable to fetch the right element in the more complex structured telling which operating mode is active. Screenshot of structure attached. And yes, never used jquery and quite a newbie on cheerio.
I have managed, way to complex, to get the value, if it is within a certain part of the tree.
const msgResult = scraped('.control-1');
const activeMode = msgResult.get(0).children.find(x => x.attribs['data-selected'] === '1').attribs['id'];
But only works on first match, fails if the data-selected === 1 isn't in that part of the tree. Thought I should be able to use just .find from the top of the tree, but no matches.
const activeMode = scraped('.control-1').find(x => x.attribs['data-selected'] === '1')
What I would like to get from the html structure attached, is the ID of the div that has data-selected=1, which again can be below any of the two divs of class control-1. Maybe also the content of the underlying span, where the mode is described in text.
HTML structure
It's hard to tell what you're looking for but maybe:
$('.control-1 [data-selected="1"]').attr('id')
You should try to make some loop to check every tree.
Try this code, hope this works.
const cheerio = require ('cheerio')
const fsextra = require ('fs-extra');
(async () => {
try {
const parseFile = async (error, contentHTML) => {
let $ = await cheerio.load (contentHTML)
const selector = $('.control-1 [data-selected="1"]')
for (let num = 0; num < selector.length; num++){
console.log ( selector[num].attribs.id )
}
}
let activeMode = await fsextra.readFile('untitled.html', 'utf-8', parseFile )
} catch (error) {
console.log ('ERROR: ', error)
}
})()
When I trigger a .click() event in a non-headless mode in puppeteer, nothing happens, not even an error.. "non-headless mode so i could visually monitor what is being clicked"
const scraper = {
test: async () => {
let browser, page;
try {
browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"]
});
page = await browser.newPage();
} catch (err) {
console.log(err);
}
try {
await page.goto("https://www.betking.com/sports/s/eventOdds/1-840-841-0-0,1-1107-1108-0-0,1-835-3775-0-0,", {
waitUntil: "domcontentloaded"
});
console.log("scraping, wait...");
} catch (err) {
console.log(err);
}
console.log("waiting....");
try {
await page.waitFor('.eventsWrapper');
} catch (err) {
console.log(err, err.response);
}
try {
let oddsListData = await page.evaluate(async () => {
let regionAreaContainer = document.querySelectorAll('.areaContainer.region .regionGroup > .regionAreas > div:first-child > .area:nth-child(5)');
regionAreaContainer = Array.prototype.slice.call(regionAreaContainer);
let t = []; //Used to monitor the element being clicked
regionAreaContainer.forEach(async (region) => {
let dat = await region.querySelector('div');
dat.innerHTML === "GG/NG" ? t.push(dat.innerHTML) : false; //Used to confirm that the right element is being clicked
dat.innerHTML === "GG/NG" ? dat.click() : false;
})
return t;
})
console.log(oddsListData);
} catch (err) {
console.log(err);
}
}
}
I expect it to click the specified button and load in some dynamic data on the page.
In Chrome's console, I get the error
Transition Rejection($id: 1 type: 2, message: The transition has been superseded by a different transition, detail: Transition#3( 'sportsMultipleEvents'{"eventMarketIds":"1-840-841-0-0,1-1107-1108-0-0,1-835-3775-0-0,"} -> 'sportsMultipleEvents'{"eventMarketIds":"1-840-841-0-0,1-1107-1108-0-0,1-835-3775-535-14,"} ))
Problem
Behaving non-human-like by executing code like element.click() (inside the page context) or element.value = '..' (see this answer for a similar problem) seems to be problematic for Angular applications. You want to try to behave more human-like by using puppeteer functions like page.click() as they simulate a "real" mouse click instead of just triggering the element's click event.
In addition the page seems to rebuild parts of the page whenever one of the items is clicked. Therefore, you need to execute the selector again after each click.
Code sample
To behave more human-like and requery the elements after each click you can change the latter part of your code to something like this:
let list = await page.$x("//div[div/text() = 'GG/NG']");
for (let i = 0; i < list.length; i++) {
await list[i].click();
// give the page some time and then query the selectors again
await page.waitFor(500);
list = await page.$x("//div[div/text() = 'GG/NG']");
}
This code uses an XPath expression to query the div elements which contain another div element with the given text. After that, a click is simulated on the element and then the contents of the page are queried another time to respect the change of the DOM elements.
Here might be a less confusing way to click those:
for(var div of document.querySelectorAll('div')){
if(div.innerHTML === 'GG/NG') div.click()
}
I am a total scrub with the node http module and having some trouble.
The ultimate goal here is to take a huge list of urls, figure out which are valid and then scrape those pages for certain data. So step one is figuring out if a URL is valid and this simple exercise is baffling me.
say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate this array, make a get request to each and if a response comes in, add the link to a list of workingURLs (for now just another array), else it goes to a list brokenURLs.
var workingURLs = [];
var brokenURLs = [];
for (var i = 0; i < allURLs.length; i++) {
var url = allURLs[i];
var req = http.get(url, function (res) {
if (res) {
workingURLs.push(?????); // How to derive URL from response?
}
});
req.on('error', function (e) {
brokenURLs.push(e.host);
});
}
what I don't know is how to properly obtain the url from the request/ response object itself, or really how to structure this kind of async code - because again, I am a nodejs scrub :(
For most websites using res.headers.location works, but there are times when the headers do not have this property and that will cause problems for me later on. Also I've tried console logging the response object itself and that was a messy and fruitless endeavor
I have tried pushing the url variable to workingURLs, but by the time any response comes back that would trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help
You need to closure url value to have access to it and protect it from changes on next loop iteration.
For example:
(function(url){
// use url here
})(allUrls[i]);
Most simple solution for this is use forEach instead of for.
allURLs.forEach(function(url){
//....
});
Promisified solution allows you to get a moment when work is done:
var http = require('http');
var allURLs = [
"http://www.yahoo.com/",
"http://www.stackoverflow.com/",
"http://www.sdfhksdjfksjdhg.net/"
];
var workingURLs = [];
var brokenURLs = [];
var promises = allURLs.map(url => validateUrl(url)
.then(res => (res?workingURLs:brokenURLs).push(url)));
Promise.all(promises).then(() => {
console.log(workingURLs, brokenURLs);
});
// ----
function validateUrl(url) {
return new Promise((ok, fail) => {
http.get(url, res => return ok(res.statusCode == 200))
.on('error', e => ok(false));
});
}
// Prevent nodejs from exit, don't need if any server listen.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
You can use something like this (Not tested):
const arr = ["", "/a", "", ""];
Promise.all(arr.map(fetch)
.then(responses=>responses.filter(res=> res.ok).map(res=>res.url))
.then(workingUrls=>{
console.log(workingUrls);
console.log(arr.filter(url=> workingUrls.indexOf(url) == -1 ))
});
EDITED
Working fiddle (Note that you can't do request to another site in the browser because of Cross domain).
UPDATED with #vp_arth suggestions
const arr = ["/", "/a", "/", "/"];
let working=[], notWorking=[],
find = url=> fetch(url)
.then(res=> res.ok ?
working.push(res.url) && res : notWorking.push(res.url) && res);
Promise.all(arr.map(find))
.then(responses=>{
console.log('woking', working, 'notWorking', notWorking);
/* Do whatever with the responses if needed */
});
Fiddle