Puppeteer - How to wait for an img after changing innerHTML? - node.js

I would like to add an image before taking a screenshot with Puppeteer.
The following code works, but instead of waiting a fixed time like this, I would like to wait until the img is actually there:
element.innerHTML = "<img id=\"logo_website\" src=\"http://random.com/logo.jpg\">";
await page.waitFor(2000)
I tried the following waitFor, but it doesn't work:
await page.waitFor("#logo_website")

You can try page.waitForResponse() in this way:
'use strict';

const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto('https://example.org/');

    await Promise.all([
      page.waitForResponse('https://www.iana.org/_img/2013.1/iana-logo-header.svg'),
      page.evaluate(() => {
        document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
      }),
    ]);

    await page.screenshot({ path: 'scr.png' });

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
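If you also want to be sure the image has finished loading and rendering (not just that its network response arrived), a hedged alternative is page.waitForFunction() polling the element; this is a small sketch that assumes the same #logo_website id as in the question:

// Sketch: inject the <img> and wait until it reports itself as fully loaded.
await page.evaluate(() => {
  document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
});
await page.waitForFunction(() => {
  const img = document.querySelector('#logo_website');
  return img !== null && img.complete && img.naturalWidth > 0;
});
await page.screenshot({ path: 'scr.png' });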

Related

How to get the number of children of an element using an XPath query in Puppeteer?

I need to get the number of children of an element with the page.$x(xpath) function in Puppeteer, but I was not successful. In the browser console I can do it using $x().children.length. What would be the best way to do this in Puppeteer with Node?
You can use either the element/JS handle API or the pure Web API inside page.evaluate(). Here are both ways:
const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch();
    const [page] = await browser.pages();
    await page.goto('https://example.org/');

    const [element] = await page.$x('//body/div');
    const children = await element.getProperty('children');
    const length1 = await (await children.getProperty('length')).jsonValue();
    console.log(length1); // 3

    const length2 = await page.evaluate(() => {
      return document.evaluate(
        '//body/div', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE
      ).singleNodeValue.children.length;
    });
    console.log(length2); // 3

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
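If you only need the count, a third option (a small sketch, assuming the page still has a //body/div element) is to select the children themselves with XPath and take the length of the returned handle array:

// Sketch: count the children by selecting them directly with XPath.
const childHandles = await page.$x('//body/div/*');
console.log(childHandles.length); // 3 on https://example.org/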

Difference when hashing page source with Node and Python

Goal:
To hash the page source in order to detect changes between scheduled scrapes.
Python code:
import requests
import hashlib
url = 'http://example.org/'
r = requests.get(url, verify=False,)
r.encoding = 'utf-8'
print(hashlib.sha256(r.text.encode('utf-8')).hexdigest())
Result: ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
Node & Puppeteer code:
const puppeteer = require('puppeteer');
var crypto = require('crypto');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
    console.log(crypto.createHash('sha256').update(response.text().toString()).digest('hex'));
  } catch (e) {
    console.log(e.message);
  }
  await browser.close();
})();
Result: b4e6060006b920bc021110ea8ab8d67744983e2b7ff75e1c8be5613af93f687d
Questions:
1. Why is there a difference? As far as I can tell, both methods return the same response.
2. Can I get the same results?
3. Is there a better way to detect changes in page content?
You need to await the response text in Puppeteer; otherwise you are hashing the stringified version of Promise { <pending> }:
const puppeteer = require('puppeteer');
var crypto = require('crypto');

(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
    const source = await response.text();
    console.log(crypto.createHash('sha256').update(source).digest('hex'));
  } catch (e) {
    console.log(e.message);
  }
  await browser.close();
})();
Output:
python c.py
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
node c.js
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
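As for detecting changes in what the user actually sees rather than in the raw HTTP body, a hedged alternative is to hash page.content(), the DOM serialized after JavaScript has run; note that this hash will generally differ from the hash of the raw response. A minimal sketch, assuming the same page and crypto objects as above:

// Sketch: hash the rendered DOM (post-JavaScript) instead of the raw response body.
const rendered = await page.content();
console.log(crypto.createHash('sha256').update(rendered).digest('hex'));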

Can we scrape data from a website that requires authentication in Node.js?

I am trying to scrape a website that requires authentication using Node.js. Is there any way to do this?
You can try puppeteer:
'use strict';

const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch({ headless: false });
    const [page] = await browser.pages();
    await page.goto('https://httpbin.org/forms/post');

    await page.type('input[name="custname"]', 'user');
    await page.type('input[name="custemail"]', 'user#example.com');

    await Promise.all([
      page.click('button'),
      page.waitForNavigation(),
    ]);

    await page.waitForSelector('pre');
    const data = await page.evaluate(() => {
      return document.querySelector('pre').innerText;
    });
    console.log(JSON.parse(data).form.custemail);

    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
===============================
For the site from the comment:
'use strict';

const puppeteer = require('puppeteer');

(async function main() {
  try {
    const browser = await puppeteer.launch({ headless: false });
    const [page] = await browser.pages();
    page.setDefaultTimeout(0);
    await page.goto('https://www.trxade.com/market/login');

    await page.waitForSelector('input[name="deaNumber"]');
    await page.type('input[name="deaNumber"]', '...');
    await page.type('input[name="password"]', '...');

    await Promise.all([
      page.click('input[name="form_login_proceed"]'),
      page.waitForNavigation(),
    ]);

    // await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
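If you plan to scrape the site repeatedly, one possible extension (a sketch only, not tested against this site) is to save the session cookies after logging in and restore them on the next run, so the login form can sometimes be skipped:

// Sketch: persist cookies after a successful login and reuse them in a later run.
// Assumes the same `page` object as above; cookies.json is a hypothetical file name.
const fs = require('fs');
const cookies = await page.cookies();
fs.writeFileSync('cookies.json', JSON.stringify(cookies, null, 2));

// ...in a later run, before navigating to a protected page:
const saved = JSON.parse(fs.readFileSync('cookies.json', 'utf8'));
await page.setCookie(...saved);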

I would like to know how to use $$eval from Puppeteer

I cannot get $$eval to work properly.
(async () => {
  const browser = await puppeteer.launch({ executablePath: chrome, args: [chromeArgs] });
  const page = await browser.newPage();
  await page.goto('https://www.example.com/', { waitUntil: "domcontentloaded" });

  var links = await page.evaluate(() => {
    var hreflist = [];
    var tags = document.querySelectorAll("p");
    Array.prototype.forEach.call(tags, (tag) => {
      hreflist.push(tag.textContent);
    });
    return hreflist;
  });

  console.log(util.inspect(links, false, null));
  browser.close();
})();
I would like to do the same thing as in the source code above, but with $$eval:
(async () => {
  const browser = await puppeteer.launch({ executablePath: chrome, args: [chromeArgs] });
  const page = await browser.newPage();
  await page.goto('https://www.example.com/', { waitUntil: "domcontentloaded" });

  var links = await page.$$eval('p', list => {
    list.map(data => {
      data.textContent
    })
  });

  console.log(util.inspect(links, false, null));
  browser.close();
})();
The execution result of $$eval() is undefined.
https://pptr.dev/#?product=Puppeteer&version=v1.10.0&show=api-pageevalselector-pagefunction-args
I looked at the official documentation, but I could not figure out the problem.
You forgot to return the values: both the outer list => { ... } and the inner data => { ... } have block bodies without a return statement. This will work:
var links = await page.$$eval('p', list => list.map(data => data.textContent));
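If you prefer keeping the block bodies, the equivalent fix is to add the explicit returns:

var links = await page.$$eval('p', list => {
  return list.map(data => {
    return data.textContent;
  });
});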

Use the same browser instance?

Hi, I am trying to make a screenshot service:
const puppeteer = require('puppeteer');

var resWidth = 1366;
var resHeight = 1000;
var browser;

(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job I try to do:
data.forEach(function(d) {
  try {
    console.log(d["url"]);
    (async () => {
      var page = await browser.newPage();
      await page.setViewport({width: resWidth, height: resHeight});
      await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
      await page.screenshot({path: './picdata/' + d['id'] + '.png', fullPage: true});
      await page.close();
    })();
  } catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot; launching a browser takes time and requires more memory. What should I do?
The problem:
(async () => {
  browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
This code never gets executed: the async arrow function is only declared, never invoked (note the missing trailing ()). And even invoking it would not be enough for your scenario, because the forEach callbacks are async tasks that can start running before the browser has finished launching.
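For illustration, the minimal change to make that block run at all is to invoke the function:

// The async IIFE must actually be called; without the final () it is just a declaration.
(async () => {
  browser = await puppeteer.launch({ ignoreHTTPSErrors: true });
})();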
My try with your example:
'use strict';

const puppeteer = require('puppeteer');

const resWidth = 1366;
const resHeight = 1000;

let browser;
let page;

async function launchBrowser() {
  browser = await puppeteer.launch({ headless: true }); // headless: true ensures no browser window is ever opened
}

launchBrowser().then(() => { // wait until the browser has launched
  data.forEach(async (d) => {
    try {
      page = await browser.newPage(); // the docs are not very clear about this method, so I am unsure whether it could be moved into launchBrowser() above
      await page.setViewport({ width: resWidth, height: resHeight });
      await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle' });
      await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
    }
    catch (e) {
      console.log(e);
      await browser.close(); // close the browser if there is an error
    }
  });
})
.then(async () => {
  await browser.close(); // finally, close the browser
});
Pro tip: start using let and const instead of var.
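As a further sketch of the same idea (untested, and using 'networkidle2', which newer Puppeteer versions expect instead of 'networkidle'), a plain for...of loop keeps every step awaited in sequence, reuses one browser, and closes it exactly once:

// Sketch: one shared browser, URLs processed sequentially, browser closed exactly once.
// Assumes data, resWidth, and resHeight from the question.
async function screenshotAll(data) {
  const browser = await puppeteer.launch({ ignoreHTTPSErrors: true });
  try {
    for (const d of data) {
      const page = await browser.newPage();
      await page.setViewport({ width: resWidth, height: resHeight });
      await page.goto(d.url, { timeout: 90000, waitUntil: 'networkidle2' });
      await page.screenshot({ path: './picdata/' + d.id + '.png', fullPage: true });
      await page.close();
    }
  } finally {
    await browser.close(); // always close the single shared browser
  }
}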
