Difference when hashing page source with Node and Python - node.js

Goal:
To hash page source in order to detect changes between scheduled scraping.
Python code:
import requests
import hashlib
url = 'http://example.org/'
r = requests.get(url, verify=False,)
r.encoding = 'utf-8'
print(hashlib.sha256(r.text.encode('utf-8')).hexdigest())
Result: ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
Node & Puppeteer code:
const puppeteer = require('puppeteer');
var crypto = require('crypto');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
try {
const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
console.log(crypto.createHash('sha256').update(response.text().toString()).digest('hex'));
} catch (e) {
console.log(e.message);
}
await browser.close();
})();
Result: b4e6060006b920bc021110ea8ab8d67744983e2b7ff75e1c8be5613af93f687d
Questions:
Why is there a difference? As far as I can tell, both methods
return the same response.
Can I get the same result from both?
Is there a better way to detect changes in page content?

You need to await the response text in Puppeteer; otherwise you are hashing the stringified version of Promise { <pending> }:
const puppeteer = require('puppeteer');
var crypto = require('crypto');
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
try {
const response = await page.goto('http://example.org/', { waitUntil: 'domcontentloaded', timeout: 30000 });
const source = await response.text();
console.log(crypto.createHash('sha256').update(source).digest('hex'));
} catch (e) {
console.log(e.message);
}
await browser.close();
})();
Output:
python c.py
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
node c.js
ea8fac7c65fb589b0d53560f5251f74f9e9b243478dcb6b3ea79b5e36449c8d9
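As for the last of the original questions (a better way to detect changes): hashing r.text / response.text() only compares the raw HTML the server sent. If changes made by client-side scripts should also count, one option is to hash the serialized DOM from page.content() instead. This is a sketch, not part of the answer above, and the resulting hash will not match the Python one, because the serialized DOM differs from the raw response bytes.
const puppeteer = require('puppeteer');
const crypto = require('crypto');
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  try {
    // Wait for the network to go idle so late-loading scripts have run.
    await page.goto('http://example.org/', { waitUntil: 'networkidle0', timeout: 30000 });
    const html = await page.content(); // serialized DOM, not the raw response
    console.log(crypto.createHash('sha256').update(html).digest('hex'));
  } catch (e) {
    console.log(e.message);
  }
  await browser.close();
})();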

Related

puppeteer - Error: Protocol error (Network.getResponseBody): No resource with given identifier found

I'm trying to get the response body from a website with this code, using Puppeteer:
#!/usr/bin/env node
require('dotenv').config();
const puppeteer = require('puppeteer');
const readline = require('readline').createInterface({
input: process.stdin,
output: process.stdout
});
const path = require('path');
const fs = require('fs');
//
console.log('Starting Puppeteer...');
let responseBody = [];
(async () => {
const browser = await puppeteer.launch({
headless: false,
executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
});
const page = await browser.newPage();
await page.setRequestInterception(true);
page.on('request', (request) => {
request.continue();
});
//
page.on('requestfinished', async (request) => {
const response = await request.response();
const url = response.url();
// store chunks url
if( url.startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/') ){
console.log(await response.buffer());
//responseBody.push(response.buffer());
}
});
//
await page.goto('https://accounts.examplecdn.com/login', {
waitUntil: ['load', 'networkidle2']
});
const emailField = await page.waitForSelector('#login-username', {timeout: 3000});
await emailField.type(process.env.EMAIL, {delay: 100});
const passwordField = await page.waitForSelector('#login-password', {timeout: 3000});
await passwordField.type(process.env.PASSWORD, {delay: 100});
const submitButton = await page.waitForSelector('#login-button', {timeout: 3000});
await submitButton.click();
//
const navigation = await page.waitForNavigation({ waitUntil: ['load', 'networkidle2'] });
//if( navigation.url().endsWith('status') ){
await page.goto('https://example.cdn.com/search', {
waitUntil: ['load', 'networkidle2']
}).then( async (response) => {
//console.log(response);
const cookieButton = await page.$('#onetrust-accept-btn-handler');
await cookieButton.click();
const searchField = await page.$('[data-testid="search-input"]');
await readline.question('What track do you want to search for?', (input) => {
console.log('answer:', input);
searchField.type(input).then( async () => {
await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[4]').then( async (element) => {
element.focus().then( async () => {
// //*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button
const playButton = await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button');
await playButton.click();
});
});
});
});
});
//}
})();
I'm having a problem with it: this error is logged and the script terminates.
/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208
this._callbacks.set(id, { resolve, reject, error: new Error(), method });
^
Error: Protocol error (Network.getResponseBody): No resource with given identifier found
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208:63
at new Promise (<anonymous>)
at CDPSession.send (/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:207:16)
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/HTTPResponse.js:99:53
at runMicrotasks (<anonymous>)
at processTicksAndRejections (node:internal/process/task_queues:93:5)
at async /Users/dev/Desktop/test/index.js:40:25
What I need to do is collect all the response body content when a certain URL is called, and then use ffmpeg to convert it back into a full-length track. How can I solve the problem? Is it possible to get the response body of each request and then join them all together?
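There is no answer in this thread, but a common approach (a sketch only, not a verified fix) is to read each body as soon as the response arrives, because Network.getResponseBody fails once the resource has been released, for example after another navigation. The snippet below assumes the page object from the script above; the URL prefix is the placeholder from the question and the output file name is hypothetical.
const fs = require('fs');
const chunks = [];
page.on('response', async (response) => {
  // Placeholder prefix taken from the question.
  if (!response.url().startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/')) return;
  try {
    chunks.push(await response.buffer()); // read the body immediately
  } catch (e) {
    console.log('body no longer available:', e.message);
  }
});
// Later, once loading has finished, join the chunks and hand them to ffmpeg:
// fs.writeFileSync('track.raw', Buffer.concat(chunks));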

How to get the number of children inside an element using an XPath query in Puppeteer?

I need to get the number of children inside an element with the page.$x(xpath) function in Puppeteer, but I was not successful. In the browser console I can do it using $x().children.length. What would be the best way to do this in Puppeteer with Node?
You can use either the element/JS handle API or the pure Web API inside page.evaluate(). Here are both ways:
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://example.org/');
const [element] = await page.$x('//body/div');
const children = await element.getProperty('children');
const length1 = await (await children.getProperty('length')).jsonValue();
console.log(length1); // 3
const length2 = await page.evaluate(() => {
return document.evaluate(
'//body/div', document, null, XPathResult.FIRST_ORDERED_NODE_TYPE
).singleNodeValue.children.length;
});
console.log(length2); // 3
await browser.close();
} catch (err) {
console.error(err);
}
})();
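A third option, a small variant of the first approach under the same assumptions, is to pass the element handle into page.evaluate() and count the children in the page context:
const [element] = await page.$x('//body/div');
const length3 = await page.evaluate((el) => el.children.length, element);
console.log(length3); // 3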

Puppeteer - How to wait for an img after changing innerHTML?

I would like to add an image before a screenshot with puppeteer.
The following code works, but instead of waiting a fixed time like this, I would like to wait until the img has loaded:
element.innerHTML = "<img id=\"logo_website\" src=\"http://random.com/logo.jpg\">";
await page.waitFor(2000)
I tried the following waitFor, but it doesn't work.
await page.waitFor("#logo_website")
You can try page.waitForResponse() in this way:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch();
const [page] = await browser.pages();
await page.goto('https://example.org/');
await Promise.all([
page.waitForResponse('https://www.iana.org/_img/2013.1/iana-logo-header.svg'),
page.evaluate(() => {
document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
}),
]);
await page.screenshot({ path: 'scr.png' });
await browser.close();
} catch (err) {
console.error(err);
}
})();
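If the exact image URL is not known up front, an alternative (a sketch, not part of the answer above) is to wait until the injected img element reports that it has finished loading:
await page.evaluate(() => {
  document.body.innerHTML = '<img id="logo_website" src="https://www.iana.org/_img/2013.1/iana-logo-header.svg">';
});
// Resolves once the image exists and has real pixel data.
await page.waitForFunction(() => {
  const img = document.querySelector('#logo_website');
  return img && img.complete && img.naturalWidth > 0;
});
await page.screenshot({ path: 'scr.png' });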

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the URLs that are scraped by the scrapeProductPage function. But before looping, it needs to log in so that it can obtain the prices, which are only displayed to logged-in users. Instead of looping through the URLs, it just returns the scraped data from one page. The error I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
async function scrapeProductPage() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
$("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
let url = $(element).attr("href");
url = "https\://www.example.com/" + url;
const scrapeResult = { url };
scrapeResults.push(scrapeResult);
});
return scrapeResults;
} catch (err) {
console.error(err);
}
}
async function scrapeDescription(productsWithImages) {
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
await page.click('#btnLogin');
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
console.error(error);
}
})
);
}
async function saveDataToCsv(data) {
const csv = new ObjectsToCsv(data);
console.log(csv);
}
async function scrapeWona() {
const productsWithImages = await scrapeProductPage();
const wonaFullData = await scrapeDescription(productsWithImages);
await saveDataToCsv(productsWithImages);
}
scrapeWona();
The reason you're getting the warning is because of process.setMaxListeners(0); it indicates you have a memory leak somewhere in the code.
You can also take a look at the documentation: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer here: node.js - request - How to "emitter.setMaxListeners()"?
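A likely reason only one page's data comes back, not covered by the answer above and therefore only a sketch, is that the Promise.all over productsWithImages fires every page.goto on the same tab at once. Visiting the URLs one at a time with that single page avoids the races; this would replace the return await Promise.all(...) block inside scrapeDescription:
// Visit the product pages sequentially with the single tab.
const results = [];
for (const job of productsWithImages) {
  try {
    await page.goto(job.url, { waitUntil: 'load' });
    const content = await page.content();
    const $ = cheerio.load(content);
    job.main_img = $('img#main_img').attr('src');
    job.name = $('h2').text();
    job.price = $('td.products_info_price').text();
    results.push(job);
  } catch (error) {
    console.error(error);
  }
}
return results;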

Can we scrape data from a website which requires authentication in Node.js?

I am trying to do web scraping with Node.js for a website that requires authentication. Is there any way to perform this in Node.js?
You can try puppeteer:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
await page.goto('https://httpbin.org/forms/post');
await page.type('input[name="custname"]', 'user');
await page.type('input[name="custemail"]', 'user#example.com');
await Promise.all([
page.click('button'),
page.waitForNavigation(),
]);
await page.waitForSelector('pre');
const data = await page.evaluate(() => {
return document.querySelector('pre').innerText;
});
console.log(JSON.parse(data).form.custemail);
await browser.close();
} catch (err) {
console.error(err);
}
})();
===============================
For the site from the comment:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
try {
const browser = await puppeteer.launch({ headless: false });
const [page] = await browser.pages();
page.setDefaultTimeout(0);
await page.goto('https://www.trxade.com/market/login');
await page.waitForSelector('input[name="deaNumber"]');
await page.type('input[name="deaNumber"]', '...');
await page.type('input[name="password"]', '...');
await Promise.all([
page.click('input[name="form_login_proceed"]'),
page.waitForNavigation(),
]);
// await browser.close();
} catch (err) {
console.error(err);
}
})();
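As a follow-up, and only an optional sketch rather than part of the answer, the logged-in session can be persisted with page.cookies() and restored with page.setCookie() so that later runs skip the login form; cookies.json is a placeholder file name:
const fs = require('fs');
// After a successful login (inside the async function):
fs.writeFileSync('cookies.json', JSON.stringify(await page.cookies()));
// On a later run, before navigating to protected pages:
const cookies = JSON.parse(fs.readFileSync('cookies.json', 'utf8'));
await page.setCookie(...cookies);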
