Can someone help me understand why the product data doesn't get printed out? I'm currently using Puppeteer to scrape a website for product data.
const puppeteer = require("puppeteer");
(async () => {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  //link to page that i want to scrape
  await page.goto(
    "link link",
    { waitUntil: "networkidle2" }
  );
  var data = await page
    .evaluate(() => {
      var productData = {};
      productData["brand"] = document.querySelector(
        "a.designer-name > span"
      ).textContent;
      console.log("mimo");
      return productData;
    })
    .catch(err => {
      console.log(err);
    });
  console.log(data);
  await browser.close();
})();
You are mixing a promise and a callback together. If you instead return a promise from page.evaluate, it should work.
Thanks to @tehhowch.
var data = await page
  .evaluate(async () => {
    return await new Promise(resolve => { // <-- return the data to Node.js from the browser
      var productData = {};
      productData["brand"] = document.querySelector(
        "a.designer-name > span"
      ).textContent;
      console.log("mimo");
      resolve(productData);
    });
  })
  .catch(err => {
    console.log(err);
  });
console.log(data);
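As a side note, the console.log("mimo") call runs in the browser context, so it appears in the page's DevTools console rather than in your Node.js terminal. If you also want to see those messages in Node.js, you can forward them; this is optional and not part of the fix above:
page.on('console', msg => console.log('PAGE LOG:', msg.text()));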
I'm trying to get the response body from a website using Puppeteer with this code:
#!/usr/bin/env node
require('dotenv').config();
const puppeteer = require('puppeteer');
const readline = require('readline').createInterface({
  input: process.stdin,
  output: process.stdout
});
const path = require('path');
const fs = require('fs');
//
console.log('Starting Puppeteer...');
let responseBody = [];
(async () => {
  const browser = await puppeteer.launch({
    headless: false,
    executablePath: '/Applications/Google Chrome.app/Contents/MacOS/Google Chrome'
  });
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', (request) => {
    request.continue();
  });
  //
  page.on('requestfinished', async (request) => {
    const response = await request.response();
    const url = response.url();
    // store chunks url
    if (url.startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/')) {
      console.log(await response.buffer());
      //responseBody.push(response.buffer());
    }
  });
  //
  await page.goto('https://accounts.examplecdn.com/login', {
    waitUntil: ['load', 'networkidle2']
  });
  const emailField = await page.waitForSelector('#login-username', { timeout: 3000 });
  await emailField.type(process.env.EMAIL, { delay: 100 });
  const passwordField = await page.waitForSelector('#login-password', { timeout: 3000 });
  await passwordField.type(process.env.PASSWORD, { delay: 100 });
  const submitButton = await page.waitForSelector('#login-button', { timeout: 3000 });
  await submitButton.click();
  //
  const navigation = await page.waitForNavigation({ waitUntil: ['load', 'networkidle2'] });
  //if( navigation.url().endsWith('status') ){
  await page.goto('https://example.cdn.com/search', {
    waitUntil: ['load', 'networkidle2']
  }).then(async (response) => {
    //console.log(response);
    const cookieButton = await page.$('#onetrust-accept-btn-handler');
    await cookieButton.click();
    const searchField = await page.$('[data-testid="search-input"]');
    await readline.question('What track do you want to search for?', (input) => {
      console.log('answer:', input);
      searchField.type(input).then(async () => {
        await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[4]').then(async (element) => {
          element.focus().then(async () => {
            // //*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button
            const playButton = await page.waitForXPath('//*[@id="searchPage"]/div/div/section[1]/div[2]/div/div/div/div[3]/button');
            await playButton.click();
          });
        });
      });
    });
  });
  //}
})();
I'm having a problem with it: this error is logged and the script terminates.
/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208
this._callbacks.set(id, { resolve, reject, error: new Error(), method });
^
Error: Protocol error (Network.getResponseBody): No resource with given identifier found
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:208:63
at new Promise (<anonymous>)
at CDPSession.send (/Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/Connection.js:207:16)
at /Users/dev/Desktop/test/node_modules/puppeteer/lib/cjs/puppeteer/common/HTTPResponse.js:99:53
at runMicrotasks (<anonymous>)
at processTicksAndRejections (node:internal/process/task_queues:93:5)
at async /Users/dev/Desktop/test/index.js:40:25
What I need to do is collect all the response body content when a certain URL is called, and then use ffmpeg to convert it back into a full-length track. How can I solve the problem? Is it possible to get the response body of each request and then join them all together?
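One way this is commonly approached (a sketch, not a definitive fix): read response.buffer() inside a try/catch as soon as the request finishes, since "No resource with given identifier found" is typically thrown when the resource has already been released (for example after a navigation), and collect the chunks in an array to join later with Buffer.concat. The output filename track.raw below is a placeholder, and whether the concatenated file is directly usable by ffmpeg depends on the stream format.
const fs = require('fs');
const chunks = [];
page.on('requestfinished', async (request) => {
  const response = request.response();
  if (!response) return;
  const url = response.url();
  if (url.startsWith('https://audio-akp-quic-control-examplecdn-com.akamaized.net/audio/')) {
    try {
      // read the body immediately; it may become unavailable after a navigation
      chunks.push(await response.buffer());
    } catch (err) {
      console.log('Could not read body for', url, '-', err.message);
    }
  }
});
// later, after all chunks have been captured:
fs.writeFileSync('track.raw', Buffer.concat(chunks)); // 'track.raw' is a placeholder name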
I'm trying to load a page with a canvas and then save it as an image.
For example, this page. In Chrome, I can right-click the canvas with a circle in the upper-right part of the page and click "Save image". I want to do the exact same thing, but through Node.js and Puppeteer. Is this possible?
So far I'm trying to select it via
const express = require('express')
const router = express.Router()
const puppeteer = require('puppeteer')
const { Cluster } = require('puppeteer-cluster')
(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE,
    maxConcurrency: 2,
  })
  function sleep(ms) {
    return new Promise(resolve => setTimeout(resolve, ms));
  }
  await cluster.task(async ({ page, data: url }) => {
    // let starmapId = 'celestial-canvas'
    await page.goto(url)
    const canvas = await page.evaluate(() => document.querySelector('#myCanvas'))
    return canvas // .toDataURL()
  })
  router.get('/export/canvas', function(req, res) {
    // Get URL
    var url = 'https://www.w3schools.com/html/tryit.asp?filename=tryhtml5_canvas_tut_path2'
    cluster.execute(url).then(canvas => {
      console.log(canvas)
      res.send(canvas)
    })
  })
})();
module.exports = router
But canvas is returning null.
In your example, the canvas is inside an iframe, so you need to get the frame first; then you will be able to transfer the string with the data URL:
import puppeteer from 'puppeteer';
const browser = await puppeteer.launch(/* { headless: false, defaultViewport: null } */);
try {
  const [page] = await browser.pages();
  await page.goto('https://www.w3schools.com/html/tryit.asp?filename=tryhtml5_canvas_tut_path2');
  const frame = await (await page.$('#iframeResult')).contentFrame();
  const data = await frame.evaluate(() => {
    return document.querySelector('#myCanvas').toDataURL();
  });
  console.log(data); // data:image/png;base64,iVBORw0K...
} catch (err) { console.error(err); } finally { await browser.close(); }
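If the goal is to save the canvas as an image file rather than just log it, the data URL returned above can be decoded in Node.js. A small follow-up sketch (the output name canvas.png is just an example):
import fs from 'fs';
// strip the data-URL prefix and decode the base64 payload
const base64 = data.replace(/^data:image\/png;base64,/, '');
fs.writeFileSync('canvas.png', Buffer.from(base64, 'base64'));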
I have a function which scrapes an element and returns the element value. This is the code of reale-scraper.js:
module.exports.RealeScraper = function() {
  return new Promise((res, rej) => {
    var url = 'example.com';
    var compagnia;
    //Start Puppeteer and scrape element
    ptr.launch().then(async browser => {
      const page = await browser.newPage();
      await page.setViewport({ width: 1280, height: 800 });
      await page.goto(url, { waitUntil: "networkidle0" });
      await page.type('input[name="username"]', config.utente);
      await page.type('input[name="password"]', config.pass);
      await Promise.all([
        page.click('input[type="SUBMIT"]'),
        page.waitForNavigation({ waitUntil: 'networkidle2' })
      ]);
      await page.waitForSelector('#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)');
      const element = await page.$("#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)");
      compagnia = await page.evaluate(element => element.textContent, element);
      await page.screenshot({ path: 'screenshot.png' });
      await browser.close();
    });
    res(compagnia);
  });
}
Then I call that function and try to send the data to my EJS template in home.js:
var scraper = require('../scrapers/reale-scraper');
router.get('/home', function(req, res, next) {
  RealeScraper().then((compagnia) => {
    res.render('nuovo-sinistro', {
      titolo: 'Manager Perizie',
      compagnia: compagnia
    });
  }).catch((error) => {
    console.log(error);
  });
});
I want to wait until RealeScraper has finished and returned a value so that I can pass it to res.render. I've tried using a Promise, but it doesn't work: it gives me no errors, but when I load the page the function doesn't seem to run, so the page is rendered without the variable.
I've also tried different approaches, but ended up with the page loading forever.
Any help would be really appreciated, thanks!
You start the Puppeteer scraping (//Start Puppeteer and scrape element) and call res(compagnia); at the same time, so compagnia is still empty when it is returned.
Just call res once the scraping has finished:
...
await browser.close();
res(compagnia);
...
I think it will be better if you only use async/await, like this:
module.exports.RealeScraper = async function () {
  var url = 'example.com';
  var compagnia;
  // Start Puppeteer and scrape the element
  let browser = await ptr.launch();
  const page = await browser.newPage();
  await page.setViewport({ width: 1280, height: 800 });
  await page.goto(url, { waitUntil: "networkidle0" });
  await page.type('input[name="username"]', config.utente);
  await page.type('input[name="password"]', config.pass);
  await page.click('input[type="SUBMIT"]'); // why do you do this in parallel?
  await page.waitForNavigation({ waitUntil: 'networkidle2' });
  await page.waitForSelector('#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)');
  const element = await page.$("#tableSceltaProfilo > tbody > tr:nth-child(1) > td:nth-child(2)");
  compagnia = await page.evaluate(element => element.textContent, element);
  await page.screenshot({ path: 'screenshot.png' });
  await browser.close();
  return compagnia;
}
// ...
const { RealeScraper } = require('../scrapers/reale-scraper');
router.get('/home', async function (req, res, next) {
  try {
    let compagnia = await RealeScraper();
    res.render('nuovo-sinistro', {
      titolo: 'Manager Perizie',
      compagnia: compagnia
    });
  } catch (error) {
    console.log(error);
  }
});
Hi guys, I want to log in to a website and, once authenticated, loop through a given set of URLs and scrape data. What I intend to do is described by this example; however, I get an unhandled promise rejection.
const puppeteer = require("puppeteer");
list = [
  "https://www.facebook.com/",
  "https://www.google.com/",
  "https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
  try {
    await p.goto(url);
    const title = await p.title();
    console.log(title);
  }
  catch (e) {
    console.log(e)
  }
  return title
};
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  console.log(this)
  for (var url of list) {
    getTitle(page, url)
  }
  await browser.close();
})();
There are multiple issues in this example.
You should await the call to the function getTitle; you are awaiting inside the function, but you have to await the call to the function too.
You should surround the call to getTitle with a try/catch block and check inside the function whether there is a title to return (e.g. the title for google is null).
const puppeteer = require("puppeteer");
list = [
  "https://www.facebook.com/",
  "https://www.google.com/",
  "https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
  try {
    await p.goto(url);
    const title = await p.title();
    if (title) {
      return title
    }
  }
  catch (e) {
    console.log(e)
    throw e
  }
};
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  console.log(this)
  for (var url of list) {
    try {
      console.log(await getTitle(page, url))
    }
    catch (e) {
      console.log('No title')
    }
  }
  await browser.close();
})();
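Since the original goal was to log in first and then loop through the URLs, the same pattern still applies: perform the login once before the loop and reuse the authenticated page. A rough sketch; the login URL and the #username, #password and #submit selectors are made-up placeholders that you would replace with the real ones:
(async () => {
  const browser = await puppeteer.launch();
  const page = await browser.newPage();
  // log in once; the URL and selectors below are placeholders
  await page.goto("https://example.com/login");
  await page.type("#username", process.env.EMAIL);
  await page.type("#password", process.env.PASSWORD);
  await Promise.all([
    page.click("#submit"),
    page.waitForNavigation()
  ]);
  // the page keeps the session cookies, so every URL in the loop is visited while authenticated
  for (const url of list) {
    try {
      console.log(await getTitle(page, url));
    } catch (e) {
      console.log('No title');
    }
  }
  await browser.close();
})();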
I am trying to do web scraping using Node.js on a website that requires authentication. Is there any way to perform this in Node.js?
You can try puppeteer:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
  try {
    const browser = await puppeteer.launch({ headless: false });
    const [page] = await browser.pages();
    await page.goto('https://httpbin.org/forms/post');
    await page.type('input[name="custname"]', 'user');
    await page.type('input[name="custemail"]', 'user@example.com');
    await Promise.all([
      page.click('button'),
      page.waitForNavigation(),
    ]);
    await page.waitForSelector('pre');
    const data = await page.evaluate(() => {
      return document.querySelector('pre').innerText;
    });
    console.log(JSON.parse(data).form.custemail);
    await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
===============================
For the site from the comment:
'use strict';
const puppeteer = require('puppeteer');
(async function main() {
  try {
    const browser = await puppeteer.launch({ headless: false });
    const [page] = await browser.pages();
    page.setDefaultTimeout(0);
    await page.goto('https://www.trxade.com/market/login');
    await page.waitForSelector('input[name="deaNumber"]');
    await page.type('input[name="deaNumber"]', '...');
    await page.type('input[name="password"]', '...');
    await Promise.all([
      page.click('input[name="form_login_proceed"]'),
      page.waitForNavigation(),
    ]);
    // await browser.close();
  } catch (err) {
    console.error(err);
  }
})();
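After the Promise.all above resolves, the session in that page is authenticated, so you can keep scraping inside the same try block before closing the browser. For example (the page URL and the .some-protected-data selector below are made-up placeholders for whatever you actually need to read):
    // still inside the try block, after the login navigation
    await page.goto('https://www.trxade.com/market/some-page'); // placeholder URL
    const text = await page.evaluate(() => {
      const el = document.querySelector('.some-protected-data'); // placeholder selector
      return el ? el.innerText : null;
    });
    console.log(text);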