How to use click and waitForNavigation? - node.js

#!/usr/bin/env node
// vim: set noexpandtab tabstop=2:
const puppeteer = require('puppeteer');
const fs = require('fs').promises;
const cookies_json_file = process.argv[2];
const url = process.argv[3];
const timeout = parseInt(process.argv[4], 10);
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
const cookiesString = await fs.readFile(cookies_json_file);
const cookies = JSON.parse(cookiesString);
await page.setCookie.apply(page, cookies);
try {
await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
const content = await page.content();
page.on('response', async response => {
if(response.url().startsWith('https://www.genecards.org/gene/api/data/Enhancers?geneSymbol=')) {
response.buffer().then(function(data) {
fs.writeFile('/dev/stdout', data);
});
}
});
const linkHandlers = await page.$x('//div[#data-ga-category = "GeneHancer"]//a[#data-role = "show-all"]');
if (linkHandlers.length > 0) {
await Promise.all([
linkHandlers[0].click()
, page.waitForNavigation({waitUntil: 'networkidle2', timeout: timeout})
]);
} else {
throw new Error("Link not found");
}
} catch (e) {
console.error(e);
process.exit(1);
} finally {
await browser.close();
}
})();
I have the above main.js.
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 30000
When I run using the above command, I got this error. Does anybody know how to fix the error? Thanks.
TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at /usr/local/lib/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:110:27)
at Page.waitForNavigation (/usr/local/lib/node_modules/puppeteer/lib/Page.js:649:49)
at Page.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:111:23)
at main.js:33:12
at processTicksAndRejections (internal/process/task_queues.js:89:5) {
name: 'TimeoutError'
}
```

You will need to disable timeout by setting the timeout parameter to 0. The default is 30 seconds (which you are also passing as a command line argument from your example), so this is behaving as expected and throwing an exception because the timeout exceeded the default or user supplied value of 30000 milliseconds.
page.waitForNavigation({waitUntil: 'networkidle2', timeout: 0})
You can also pass the parameter from the command line as an argument, which is preferable so as not to hard-code the value:
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 0

Related

puppeteer - Error: Protocol error (Network.getResponseBody): No resource with given identifier found

I'm trying with this code to get the response body from a website using puppeteer
#!/usr/bin/env node
require('dotenv').config();
const puppeteer = require('puppeteer');
const readline = require('readline').createInterface({
input: process.stdin,
output: process.stdout
});
const path = require('path');
const fs = require('fs');
//
console.log('Starting Puppeteer...');
let responseBody = [];
(async () => {
const browser = await puppeteer.launch({
headless: false,
executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
});
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on('request', (request) => {
request.continue();
});
//
page.on('requestfinished', async (request) => {
const response = await request.response();
const url = response.url();
// store chunks url
if( url.startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/') ){
console.log(await response.buffer());
//responseBody.push(response.buffer());
}
});
//
await page.goto('https://accounts.examplecdn.com/login', {
waitUntil: ['load', 'networkidle2']
});
const emailField = await page.waitForSelector('#login-username', {timeout: 3000});
await emailField.type(process.env.EMAIL, {delay: 100});
const passwordField = await page.waitForSelector('#login-password', {timeout: 3000});
await passwordField.type(process.env.PASSWORD, {delay: 100});
const submitButton = await page.waitForSelector('#login-button', {timeout: 3000});
await submitButton.click();
//
const navigation = await page.waitForNavigation({ waitUntil: ['load', 'networkidle2'] });
//if( navigation.url().endsWith('status') ){
await page.goto('https://example.cdn.com/search', {
waitUntil: ['load', 'networkidle2']
}).then( async (response) => {
//console.log(response);
const cookieButton = await page.$('#onetrust-accept-btn-handler');
await cookieButton.click();
const searchField = await page.$('[data-testid="search-input"]');
await readline.question('What track do you want to search for?', (input) => {
console.log('answer:', input);
searchField.type(input).then( async () => {
await page.waitForXPath('//*[#id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[4]').then( async (element) => {
element.focus().then( async () => {
// //*[#id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button
const playButton = await page.waitForXPath('//*[#id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button');
await playButton.click();
});
});
});
});
});
//}
})();
I'm having problem with it and this error will be logged and the script will terminate.
/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208
this._callbacks.set(id, { resolve, reject, error: new Error(), method });
^
Error: Protocol error (Network.getResponseBody): No resource with given identifier found
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208:63
at new Promise (<anonymous>)
at CDPSession.send (/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:207:16)
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/HTTPResponse.js:99:53
at runMicrotasks (<anonymous>)
at processTicksAndRejections (node:internal/process/task_queues:93:5)
at async /Users/dev/Desktop/test/index.js:40:25
what I need to do is to collect all the response body content when a certain url is called, then using ffmpeg I want to convert it back to a full length track. How I can solve the problem, is possible to get the response body of each request and then join all togheter?

Puppeteer waitForSelector is not working inside loop

My task is form submission with different data. So, I am using puppeteer and for of loop.
Code example:
const puppeteer = require('puppeteer')
const data = require('data.json') // ~30 products
(async () => {
try {
const browser = await puppeteer.launch({
headless: false,
ignoreHTTPSErrors: true,
defaultViewport: null,
});
const page2 = await browser.newPage();
await page2.goTo('mywebsite', {waitUntil: 'domcontentloaded'} )
for (let product of data) {
// Waiting for some selector, after that do something with it
await page2.waitForSelector("#someSelector", { visible: true });
await page2.type("#someSelector", product.someData);
//
//... A lot of code that similar to above is here
//
// Go back after all things done
await page2.waitFor(2000);
await page2.waitForSelector('[title="home"]', { visible: true });
await page2.click('[title="home"]', { clickCount: 1 });
counter++;
console.log(
`===========================================================================${counter}`
);
}
} catch (err) {
throw new Error(err);
}
})();
The problem is that this is works, but not always works, for example, a loop can work 15 times, and then fall off or go through a full cycle without failing.
The error is always the same:
UnhandledPromiseRejectionWarning: Error: TimeoutError: waiting for selector "#someSelector" failed: timeout 30000ms exceeded
However, if I check the page, then everything is there, the elements are on the page, but puppeteer does not see them. How Can I fix this?
My current solution for this is a retry function:
const chalk = require("chalk");
const util = require("util");
const delay = util.promisify(setTimeout);
async function retry(fn, retryDelay = 200, numRetries = 5) {
for (let i = 0; i < numRetries; i++) {
try {
return await fn();
} catch (err) {
console.log(
"$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
);
console.log(chalk.yellow(err));
if (i === numRetries - 1) throw err;
await delay(retryDelay);
retryDelay = retryDelay * 2;
}
}
}

Difference when hashing page source with Node and Python

Goal:
To hash page source in order to detect changes between scheduled scraping.
Python code:
import requests
import hashlib
url = 'http://example.org/'
r = requests.get(url, verify=False,)
r.encoding = 'utf-8'
print(hashlib.sha256(r.text.encode('utf-8')).hexdigest())
Result: ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
Node & Puppeteer code:
const puppeteer = require('puppeteer');
var crypto = require('crypto');
(async()=> {
const browser= await puppeteer.launch();
const page= await browser.newPage();
try {
const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
console.log(crypto.createHash('sha256').update(response.text().toString()).digest('hex'));
} catch (e) {
console.log(e.message);
}
await browser.close();
})();
Result: b4e6060006b920bc021110ea8ab8d67744983e2b7ff75e1c8be5613af93f687d
Questions:
Why is there a difference? As far as I inspected, both methods
return the same response.
Can I get same results?
Are there a better ways to detect changes in page content?
You need to await the response text in puppeteer, otherwise you are hashing the stringified version of Promise { <pending> }
const puppeteer = require('puppeteer');
var crypto = require('crypto');
(async()=> {
const browser= await puppeteer.launch();
const page= await browser.newPage();
try {
const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
const source = await response.text();
console.log(crypto.createHash('sha256').update(source).digest('hex'));
} catch (e) {
console.log(e.message);
}
await browser.close();
})();
Output:
python c.py
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
node c.js
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9

Puppeteer do not recognizing the link

I'm trying to grab some html in for expression, but somehow I'm getting error
Error: Evaluation failed: ReferenceError: link is not defined
at __puppeteer_evaluation_script__:8:29
at ExecutionContext.evaluateHandle (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:124:13)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at ExecutionContext.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:58:31)
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at Frame.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\FrameManager.js:447:20)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at Frame.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at Page.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\Page.js:777:43)
at Page.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at zrGrabber.StartGrabbingHtml (C:\Repositories\auto-grabber-server\grabbers\zr.grabber.js:52:40)
at process._tickCallback (internal/process/next_tick.js:68:7)
Link has been passed to the StartGrabbingHtml function, but then I,m getting mentioned error. I suppose that something is wrong with async staff, but can't get what exactly.
const puppeteer = require("puppeteer");
let links = [];
const Mongo = require('./../db/mongo');
const zrLinks = [
"https://www.zr.ru/stories/consultant/optimalno/",
"https://www.zr.ru/news/avtomobili/",
"https://www.zr.ru/stories/prezentaciya-car/new/"
];
module.exports = class zrGrabber {
async startGrabbingLinks() {
try {
for (let i = 0; i < zrLinks.length; i++) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(zrLinks[i], {
waitUntil: 'load',
timeout: 0
});
const result = await page.evaluate(() => {
const links = document.querySelectorAll('div.head > h2 > a')
return [...links].map(link => link.href);
});
await page.close();
await browser.close();
links = [...links, ...result];
}
const db = new Mongo();
for (let i = 0; i < links.length; i++) {
// if link already in database skip grabbing
const found = await db.findLink(links[i]);
if (found) {
continue;
}
// else grab and write link to database
await this.StartGrabbingHtml(links[i])
}
} catch (err) {
console.log(err)
}
}
async StartGrabbingHtml(link) {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(link, {
waitUntil: 'load',
timeout: 0
});
const article = await page.evaluate(() => { // error throwing here
const date = document.querySelector('#storyDetailArticle > time').innerHTML;
const name = document.querySelector('#storyDetailArticle > h1').innerHTML;
const description = document.querySelector('#storyDetailArticle > div.stroy_announcement > h3').innerHTML;
const author = document.querySelector('#storyDetailArticle > div.announcement_author.story_author.no_preview > div').innerHTML;
const content = document.querySelector('#storyDetailArticle > div.stroy_content').innerHTML;
return {
source: link,
date: date,
name: name,
description: description,
author: author,
content: content
};
});
console.log(article)
const db = new Mongo();
await db.insertOne(article);
await page.close();
await browser.close();
} catch (err) {
console.log(err)
}
}
}
What I'm doing wrong here?
The script cannot access the variable link from inside the page.evaluate context.
You should pass it as an argument like this:
await page.evaluate(link => {
// ...
}, link);

use same browser instance?

Hi I am trying to make a screenshot service
const puppeteer = require('puppeteer');
var resWidth = 1366;
var resHeight = 1000;
var browser;
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a work I try to do
data.forEach(function(d){
try {
console.log(d["url"]);
(async () => {
var page = await browser.newPage();
await page.setViewport({width: resWidth, height: resHeight});
await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
await page.screenshot({path: './picdata/' + d['id'] + '.png' ,fullPage: true});
await page.close();
})();
} catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot launching browser takes time and requires more memory?
what should I do?
The problem:
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed. Why? because it's not a true closure.
More on closures, here.
That being said, that wont have work for your given scenario, as they are async tasks.
My try with your example:
'use strict';
const puppeteer = require('puppeteer');
const resWidth = 1366;
const resHeight = 1000;
let browser;
let page;
async function launchBrowser() {
browser = await puppeteer.launch({ headless: true }); //this "{ headless: true }" will ensure a browser window is not open a single time.
};
launchBrowser().then((x) => { // wait until browser has launched
data.forEach(async (d) => {
try {
page = await browser.newPage(); // the document is not very clear about this method, so I am still unsure if I could move this in the above launchBrowser() method.
await page.setViewport({ width: resWidth, height: resHeight });
await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
}
catch (e) {
console.log(e);
await browser.close(); // close browser if there is an error
}
});
})
.then(() => {
await browser.close(); // close browser finally.
});
Pro Tip: Start using let, const instead of var.
There is a great article on this, here

Resources