#!/usr/bin/env node
// vim: set noexpandtab tabstop=2:
const puppeteer = require('puppeteer');
const fs = require('fs').promises;
const cookies_json_file = process.argv[2];
const url = process.argv[3];
const timeout = parseInt(process.argv[4], 10);
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  const cookiesString = await fs.readFile(cookies_json_file);
  const cookies = JSON.parse(cookiesString);
  await page.setCookie(...cookies);
  try {
    await page.goto(url, { waitUntil: 'networkidle2', timeout: timeout });
    const content = await page.content();
    page.on('response', async response => {
      if (response.url().startsWith('https://www.genecards.org/gene/api/data/Enhancers?geneSymbol=')) {
        response.buffer().then(function(data) {
          fs.writeFile('/dev/stdout', data);
        });
      }
    });
    const linkHandlers = await page.$x('//div[@data-ga-category = "GeneHancer"]//a[@data-role = "show-all"]');
    if (linkHandlers.length > 0) {
      await Promise.all([
        linkHandlers[0].click()
        , page.waitForNavigation({waitUntil: 'networkidle2', timeout: timeout})
      ]);
    } else {
      throw new Error("Link not found");
    }
  } catch (e) {
    console.error(e);
    process.exit(1);
  } finally {
    await browser.close();
  }
})();
I have the above main.js.
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 30000
When I run it with the above command, I get this error. Does anybody know how to fix it? Thanks.
TimeoutError: Navigation Timeout Exceeded: 30000ms exceeded
at /usr/local/lib/node_modules/puppeteer/lib/LifecycleWatcher.js:142:21
-- ASYNC --
at Frame.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:110:27)
at Page.waitForNavigation (/usr/local/lib/node_modules/puppeteer/lib/Page.js:649:49)
at Page.<anonymous> (/usr/local/lib/node_modules/puppeteer/lib/helper.js:111:23)
at main.js:33:12
at processTicksAndRejections (internal/process/task_queues.js:89:5) {
name: 'TimeoutError'
}
You need to disable the timeout by setting the timeout parameter to 0. The default is 30 seconds (which you are also passing as a command-line argument in your example), so this is behaving as expected: the navigation took longer than the default or user-supplied value of 30000 milliseconds, so an exception was thrown.
page.waitForNavigation({waitUntil: 'networkidle2', timeout: 0})
You can also pass the parameter from the command line as an argument, which is preferable so as not to hard-code the value:
$ ./main.js cookies.json 'https://www.genecards.org/cgi-bin/carddisp.pl?gene=BSCL2' 0
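If the timeout argument may be omitted, a minimal sketch of a fallback (my assumption, not part of the original script) that treats a missing or non-numeric fourth argument as "no timeout":
const timeout = Number.parseInt(process.argv[4], 10) || 0; // NaN from a missing argument falls back to 0, i.e. no timeout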
I'm trying to get the response body from a website using Puppeteer, with this code:
#!/usr/bin/env node
require('dotenv').config();
const puppeteer = require('puppeteer');
const readline = require('readline').createInterface({
  input: process.stdin,
  output: process.stdout
});
const path = require('path');
const fs = require('fs');
//
console.log('Starting Puppeteer...');
let responseBody = [];
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
  });
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    request.continue();
  });
  //
  page.on('requestfinished', async (request) => {
    const response = await request.response();
    const url = response.url();
    // store chunks url
    if (url.startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/')) {
      console.log(await response.buffer());
      //responseBody.push(response.buffer());
    }
  });
  //
  await page.goto('https://accounts.examplecdn.com/login', {
    waitUntil: ['load', 'networkidle2']
  });
  const emailField = await page.waitForSelector('#login-username', {timeout: 3000});
  await emailField.type(process.env.EMAIL, {delay: 100});
  const passwordField = await page.waitForSelector('#login-password', {timeout: 3000});
  await passwordField.type(process.env.PASSWORD, {delay: 100});
  const submitButton = await page.waitForSelector('#login-button', {timeout: 3000});
  await submitButton.click();
  //
  const navigation = await page.waitForNavigation({ waitUntil: ['load', 'networkidle2'] });
  //if( navigation.url().endsWith('status') ){
  await page.goto('https://example.cdn.com/search', {
    waitUntil: ['load', 'networkidle2']
  }).then(async (response) => {
    //console.log(response);
    const cookieButton = await page.$('#onetrust-accept-btn-handler');
    await cookieButton.click();
    const searchField = await page.$('[data-testid="search-input"]');
    await readline.question('What track do you want to search for?', (input) => {
      console.log('answer:', input);
      searchField.type(input).then(async () => {
        await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[4]').then(async (element) => {
          element.focus().then(async () => {
            // //*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button
            const playButton = await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button');
            await playButton.click();
          });
        });
      });
    });
  });
  //}
})();
I'm having a problem with it: this error gets logged and the script terminates.
/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208
this._callbacks.set(id, { resolve, reject, error: new Error(), method });
^
Error: Protocol error (Network.getResponseBody): No resource with given identifier found
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208:63
at new Promise (<anonymous>)
at CDPSession.send (/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:207:16)
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/HTTPResponse.js:99:53
at runMicrotasks (<anonymous>)
at processTicksAndRejections (node:internal/process/task_queues:93:5)
at async /Users/dev/Desktop/test/index.js:40:25
What I need to do is collect all of the response body content when a certain URL is called, and then convert it back into a full-length track using ffmpeg. How can I solve the problem? Is it possible to get the response body of each request and then join them all together?
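One approach, as a sketch rather than a verified fix: the "No resource with given identifier found" error usually means the response body had already been discarded by the browser (for example after a redirect or navigation) by the time buffer() was called. So wrap the read in try/catch, collect the chunks that do arrive, and concatenate them at the end before handing the result to ffmpeg. This would replace the requestfinished handler from the question; CHUNK_PREFIX and the output file name are placeholders:
const fs = require('fs');
const CHUNK_PREFIX = 'https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/';
const chunks = [];
page.on('requestfinished', async (request) => {
  const response = request.response();
  if (!response || !response.url().startsWith(CHUNK_PREFIX)) return;
  try {
    chunks.push(await response.buffer()); // throws if the body is no longer available
  } catch (e) {
    // body already evicted; skip this chunk
  }
});
// later, once playback has finished, join the chunks for ffmpeg:
// fs.writeFileSync('track.raw', Buffer.concat(chunks));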
My task is form submission with different data, so I am using Puppeteer and a for...of loop.
Code example:
const puppeteer = require('puppeteer');
const data = require('./data.json'); // ~30 products

(async () => {
  try {
    const browser = await puppeteer.launch({
      headless: false,
      ignoreHTTPSErrors: true,
      defaultViewport: null,
    });
    const page2 = await browser.newPage();
    await page2.goto('mywebsite', {waitUntil: 'domcontentloaded'});
    let counter = 0;
    for (let product of data) {
      // Waiting for some selector, after that do something with it
      await page2.waitForSelector("#someSelector", { visible: true });
      await page2.type("#someSelector", product.someData);
      //
      //... a lot of code similar to the above goes here
      //
      // Go back after all things done
      await page2.waitFor(2000);
      await page2.waitForSelector('[title="home"]', { visible: true });
      await page2.click('[title="home"]', { clickCount: 1 });
      counter++;
      console.log(
        `===========================================================================${counter}`
      );
    }
  } catch (err) {
    throw new Error(err);
  }
})();
The problem is that this works, but not always: the loop can run 15 times and then fail, or go through a full cycle without failing.
The error is always the same:
UnhandledPromiseRejectionWarning: Error: TimeoutError: waiting for selector "#someSelector" failed: timeout 30000ms exceeded
However, if I check the page, everything is there; the elements are on the page, but Puppeteer does not see them. How can I fix this?
My current solution for this is a retry function:
const chalk = require("chalk");
const util = require("util");
const delay = util.promisify(setTimeout);

async function retry(fn, retryDelay = 200, numRetries = 5) {
  for (let i = 0; i < numRetries; i++) {
    try {
      return await fn();
    } catch (err) {
      console.log(
        "$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$"
      );
      console.log(chalk.yellow(err));
      if (i === numRetries - 1) throw err;
      await delay(retryDelay);
      retryDelay = retryDelay * 2;
    }
  }
}
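Usage then looks like this (the selector is the one from the question); the wait is retried up to five times, with the delay doubling between attempts:
await retry(() => page2.waitForSelector("#someSelector", { visible: true }));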
Goal:
To hash the page source in order to detect changes between scheduled scrapes.
Python code:
import requests
import hashlib
url = 'http://example.org/'
r = requests.get(url, verify=False)
r.encoding = 'utf-8'
print(hashlib.sha256(r.text.encode('utf-8')).hexdigest())
Result: ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
Node & Puppeteer code:
const puppeteer = require('puppeteer');
var crypto = require('crypto');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
    console.log(crypto.createHash('sha256').update(response.text().toString()).digest('hex'));
  } catch (e) {
    console.log(e.message);
  }
  await browser.close();
})();
Result: b4e6060006b920bc021110ea8ab8d67744983e2b7ff75e1c8be5613af93f687d
Questions:
Why is there a difference? As far as I can tell, both methods return the same response.
Can I get the same results?
Are there better ways to detect changes in page content?
You need to await the response text in Puppeteer; otherwise you are hashing the stringified version of Promise { <pending> }:
const puppeteer = require('puppeteer');
var crypto = require('crypto');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
    const source = await response.text();
    console.log(crypto.createHash('sha256').update(source).digest('hex'));
  } catch (e) {
    console.log(e.message);
  }
  await browser.close();
})();
Output:
python c.py
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
node c.js
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
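On the third question: response.text() is the raw HTTP body, which is why it matches requests. If you want to detect changes in the rendered page instead, you could hash page.content(), but be aware that the result may differ between runs whenever client-side JavaScript mutates the DOM. A sketch:
// hash the rendered DOM rather than the raw response
const source = await page.content();
console.log(crypto.createHash('sha256').update(source).digest('hex'));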
I'm trying to grab some HTML in a for loop, but somehow I'm getting this error:
Error: Evaluation failed: ReferenceError: link is not defined
at __puppeteer_evaluation_script__:8:29
at ExecutionContext.evaluateHandle (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:124:13)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at ExecutionContext.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:58:31)
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at Frame.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\FrameManager.js:447:20)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at Frame.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at Page.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\Page.js:777:43)
at Page.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at zrGrabber.StartGrabbingHtml (C:\Repositories\auto-grabber-server\grabbers\zr.grabber.js:52:40)
at process._tickCallback (internal/process/next_tick.js:68:7)
The link has been passed to the StartGrabbingHtml function, but then I get the error mentioned above. I suppose something is wrong with the async stuff, but I can't tell what exactly.
const puppeteer = require("puppeteer");
let links = [];
const Mongo = require('./../db/mongo');
const zrLinks = [
  "https://www.zr.ru/stories/consultant/optimalno/",
  "https://www.zr.ru/news/avtomobili/",
  "https://www.zr.ru/stories/prezentaciya-car/new/"
];

module.exports = class zrGrabber {
  async startGrabbingLinks() {
    try {
      for (let i = 0; i < zrLinks.length; i++) {
        const browser = await puppeteer.launch();
        const page = await browser.newPage();
        await page.goto(zrLinks[i], {
          waitUntil: 'load',
          timeout: 0
        });
        const result = await page.evaluate(() => {
          const links = document.querySelectorAll('div.head > h2 > a');
          return [...links].map(link => link.href);
        });
        await page.close();
        await browser.close();
        links = [...links, ...result];
      }
      const db = new Mongo();
      for (let i = 0; i < links.length; i++) {
        // if link already in database skip grabbing
        const found = await db.findLink(links[i]);
        if (found) {
          continue;
        }
        // else grab and write link to database
        await this.StartGrabbingHtml(links[i]);
      }
    } catch (err) {
      console.log(err);
    }
  }

  async StartGrabbingHtml(link) {
    try {
      const browser = await puppeteer.launch();
      const page = await browser.newPage();
      await page.goto(link, {
        waitUntil: 'load',
        timeout: 0
      });
      const article = await page.evaluate(() => { // error throwing here
        const date = document.querySelector('#storyDetailArticle > time').innerHTML;
        const name = document.querySelector('#storyDetailArticle > h1').innerHTML;
        const description = document.querySelector('#storyDetailArticle > div.stroy_announcement > h3').innerHTML;
        const author = document.querySelector('#storyDetailArticle > div.announcement_author.story_author.no_preview > div').innerHTML;
        const content = document.querySelector('#storyDetailArticle > div.stroy_content').innerHTML;
        return {
          source: link,
          date: date,
          name: name,
          description: description,
          author: author,
          content: content
        };
      });
      console.log(article);
      const db = new Mongo();
      await db.insertOne(article);
      await page.close();
      await browser.close();
    } catch (err) {
      console.log(err);
    }
  }
};
What am I doing wrong here?
The script cannot access the variable link from inside the page.evaluate context.
You should pass it as an argument like this:
await page.evaluate(link => {
  // ...
}, link);
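Applied to the evaluate call in StartGrabbingHtml, that would look roughly like this (abridged to two fields):
const article = await page.evaluate((link) => {
  const date = document.querySelector('#storyDetailArticle > time').innerHTML;
  const name = document.querySelector('#storyDetailArticle > h1').innerHTML;
  return { source: link, date: date, name: name };
}, link);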
Hi, I am trying to make a screenshot service:
const puppeteer = require('puppeteer');

var resWidth = 1366;
var resHeight = 1000;
var browser;

(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job I try to do this:
data.forEach(function(d){
  try {
    console.log(d["url"]);
    (async () => {
      var page = await browser.newPage();
      await page.setViewport({width: resWidth, height: resHeight});
      await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
      await page.screenshot({path: './picdata/' + d['id'] + '.png', fullPage: true});
      await page.close();
    })();
  } catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot, since launching a browser takes time and requires more memory.
What should I do?
The problem:
(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed. Why? Because the async arrow function is only defined, never invoked: there is no trailing () after it, so the browser is never launched and browser stays undefined.
Even with the invocation added, that alone would not fix your scenario, because the screenshot tasks are async and can start before the launch has finished.
My attempt with your example:
'use strict';
const puppeteer = require('puppeteer');

const resWidth = 1366;
const resHeight = 1000;
let browser;

async function launchBrowser() {
  browser = await puppeteer.launch({ headless: true }); // "headless: true" ensures a browser window is never shown
}

launchBrowser().then(async () => { // wait until the browser has launched
  for (const d of data) { // sequential loop, so the final close() cannot run early
    try {
      const page = await browser.newPage();
      await page.setViewport({ width: resWidth, height: resHeight });
      await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
      await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
      await page.close();
    } catch (e) {
      console.log(e); // log and continue with the remaining URLs
    }
  }
  await browser.close(); // close the browser once all screenshots are done
});
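A note on the design choice above (my reading, not something the original snippet guaranteed): the for...of loop processes the URLs sequentially, so browser.close() can only run after the last screenshot has finished. If you need more throughput, you could open several pages concurrently, but you would then have to track their promises (for example with Promise.all over batches) before closing the browser.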
Pro tip: start using let and const instead of var.