I would like to know how to use $$eval from Puppeteer - node.js

I cannot get $$eval to work correctly.
// Launches Chrome, opens the page and prints the text of every <p> element.
(async () => {
  const launchOptions = { executablePath: chrome, args: [chromeArgs] };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();
  await page.goto('https://www.example.com/', { waitUntil: "domcontentloaded" });

  // Runs inside the page: collect the textContent of each paragraph, in document order.
  const links = await page.evaluate(() => {
    const paragraphs = document.querySelectorAll("p");
    return Array.from(paragraphs, (paragraph) => paragraph.textContent);
  });

  console.log(util.inspect(links, false, null));
  browser.close();
})();
I would like to do the same thing as the source code written above.
// Attempts to collect the text of every <p> element with page.$$eval and print it.
(async() => {
const browser = await puppeteer.launch({ executablePath: chrome ,args: [chromeArgs]});
const page = await browser.newPage();
await page.goto('https://www.example.com/', {waitUntil: "domcontentloaded"});
// NOTE(review): both arrow functions below use a block body without a `return`
// statement, so the callback handed to $$eval produces undefined.
var links = await page.$$eval('p', list => {
list.map(data => {
data.textContent
})
});
console.log(util.inspect(links, false, null));
browser.close();
})();
The execution result of $$eval() is undefined.
https://pptr.dev/#?product=Puppeteer&version=v1.10.0&show=api-pageevalselector-pagefunction-args
I saw the official document.
However, I could not work out what the problem is.

You forgot to return the value from the callbacks. This will work:
// Expression-bodied arrow functions return their result implicitly, so the mapped array is passed back to Node.
var links = await page.$$eval('p', (paragraphs) => paragraphs.map((paragraph) => paragraph.textContent));

Related

How do I scrape images from udemy using NodeJS and puppeteer

This is my code - scraping courses titles works ok, but I have problem with images
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')
/**
 * Creates a promise that settles after a delay.
 * @param {number} ms - Delay in milliseconds, passed straight to setTimeout.
 * @returns {Promise<void>} Promise resolved (with no value) once the timer fires.
 */
function sleep(ms)
{
  return new Promise((done) => {
    setTimeout(done, ms);
  });
}
// Scrapes course titles and image sources from a Udemy listing page, writes one
// "index;title;image" line per course to courses.txt and saves a full-page screenshot.
async function start()
{
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL");
// Fixed 5-second pause after navigation (presumably to let the page render; no explicit wait condition is used).
await sleep(5000);
// Text content of every element matching the course-title selector.
const names = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
})
// NOTE(review): the selector ends at the image-wrapper <div>, so `src` is read from
// those wrapper elements themselves, not from an <img> inside them.
const images = await page.evaluate(() => {
return Array.from(
document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny")
).map((image) => image.getAttribute(`src`));
});
// Overwrite each title in place with "index;title;image", pairing the two lists by position.
let m = ";";
for (let i = 0; i < names.length; i++)
{
names[i] = i+m+names[i]+m+images[i]
}
await fs.writeFile("courses.txt", names.join("\r\n"))
await page.screenshot({ path: "udemy.png", fullPage: true });
await browser.close();
}
start()
Now it returns null instead of the image URLs; changing src to srcset makes no difference.
The page that I want to scrape the images from is https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL
On a screenshot that this script takes I can see that courses icons are blacked out. I can scrape images that are visible on screenshot, but not those that are blacked out.
OK, I found the answer: I added a call to setViewport and appended img to the end of the querySelectorAll selector.
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')
/**
 * Returns a promise that resolves after the given delay.
 * @param {number} ms - Delay in milliseconds, forwarded to setTimeout.
 * @returns {Promise<void>} Promise fulfilled with no value when the timer fires.
 */
function sleep(ms)
{
  return new Promise((wake) => {
    setTimeout(wake, ms);
  });
}
/**
 * Scrapes course titles and thumbnail URLs from the Udemy listing page, writes one
 * "index;title;image" line per course to courses.txt and saves a full-page screenshot.
 *
 * The browser is always closed, even when a step throws, so a failed run does not
 * leave a Chrome process behind.
 *
 * @returns {Promise<void>} Resolves once the file and screenshot are written and the browser is closed.
 */
async function start()
{
  const browser = await puppeteer.launch({ headless: false });
  try {
    const page = await browser.newPage();
    await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL", { "waitUntil": "networkidle0" });
    await sleep(1000);

    // Resize the viewport to the full document size; per the answer this is what
    // makes the thumbnails that were blacked out in the screenshot load.
    const bodyWidth = await page.evaluate(() => document.body.scrollWidth);
    const bodyHeight = await page.evaluate(() => document.body.scrollHeight);
    await page.setViewport({ width: bodyWidth, height: bodyHeight });
    await sleep(1000);

    const names = await page.evaluate(() => {
      return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent);
    });
    // The trailing "img" selects the image element itself, which carries the src attribute.
    const images = await page.evaluate(() => {
      return Array.from(
        document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny img")
      ).map((image) => image.getAttribute(`src`));
    });

    // Titles and images are paired by position in their respective lists.
    const m = ";";
    const lines = names.map((name, i) => `${i}${m}${name}${m}${images[i]}`);

    await fs.writeFile("courses.txt", lines.join("\r\n"));
    await page.screenshot({ path: "udemy.png", fullPage: true });
  } finally {
    await browser.close();
  }
}

// Report failures instead of leaving an unhandled promise rejection.
start().catch((error) => {
  console.error(error);
  process.exitCode = 1;
});

Loop not working in nodejs puppeteer script

I'm new to Node.js and Puppeteer. I'm trying to loop through some street names in a public county property value search, but I cannot see what I'm doing wrong. I had this working for an individual street name before I attempted to loop over several street names. I've replaced the real street names for privacy.
const puppeteer = require('puppeteer');
// Intended to run one property search per street name and export the results.
var street_names = ["street1","street2","street3"]
for (var i = 0; i < street_names.length;i++) {
// console.log(street_names[i]); // Used to test if the loop works.
(async () => {
const browser = await puppeteer.launch({executablePath: '/usr/bin/chromium-browser'});
const page = await browser.newPage();
await page.setViewport({ width: 1920, height: 1080 });
await page.setDefaultNavigationTimeout(0);
// Property Search Page
await page.goto('http://propaccess.traviscad.org/clientdb/PropertySearch.aspx?cid=1', {waitUntil: 'domcontentloaded'});
//type the enter street
await page.select('select[name="propertySearchOptions:recordsPerPage"]', '250'); // Select 250 results per page
await page.screenshot({path: 'screenshot.jpg', fullPage: true});
await page.focus('#propertySearchOptions_streetName');
await page.type('input[name="propertySearchOptions:streetName"]',street_names[i]);
//await page.keyboard.type('street_names[i]');
await page.click('#propertySearchOptions_searchAdv');
// Enter Results Page
// NOTE(review): the path below is a plain string literal, so it is the same file name for every street.
await page.screenshot({path: 'street_names[i]_screenshot.jpg', fullPage: true});
await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './results'});
await page.waitForSelector('#footer');
await page.click('#propertySearchResults_exportResults');
await page.waitForTimeout (3500);
await page.screenshot({path: 'screenshot.jpg', fullPage: true});
await browser.close();
process.exit(1);
// NOTE(review): the async arrow function is closed here without a trailing (),
// so each iteration only defines it and never invokes it.
});
}
You forgot to call (to add () after) the defined async function.
It would be more efficient to open the browser once and then reuse it with its page. To do so, you can place the loop inside the async function.
const puppeteer = require('puppeteer');
// Street names to search for, one search per entry.
// The terminating semicolon is required: without it the parenthesised async
// function on the next statement is parsed as a call on this array literal.
const street_names = ["street1","street2","street3"];

/**
 * Opens one browser and one page, then for each street name runs a property
 * search, exports the results into ./results and takes screenshots along the way.
 * The browser is closed even if a step fails.
 */
(async () => {
  const browser = await puppeteer.launch({executablePath: '/usr/bin/chromium-browser'});
  try {
    const page = await browser.newPage();
    await page.setViewport({ width: 1920, height: 1080 });
    page.setDefaultNavigationTimeout(0); // 0 disables the navigation timeout

    for (const streetName of street_names) {
      // Property Search Page
      await page.goto('http://propaccess.traviscad.org/clientdb/PropertySearch.aspx?cid=1', {waitUntil: 'domcontentloaded'});
      await page.select('select[name="propertySearchOptions:recordsPerPage"]', '250'); // Select 250 results per page
      await page.screenshot({path: 'screenshot.jpg', fullPage: true});
      await page.focus('#propertySearchOptions_streetName');
      await page.type('input[name="propertySearchOptions:streetName"]', streetName);
      await page.click('#propertySearchOptions_searchAdv');

      // Results Page: one screenshot file per street name.
      await page.screenshot({path: `${streetName}_screenshot.jpg`, fullPage: true});
      // NOTE(review): page._client is a private Puppeteer API and may change between versions.
      await page._client.send('Page.setDownloadBehavior', {behavior: 'allow', downloadPath: './results'});
      await page.waitForSelector('#footer');
      await page.click('#propertySearchResults_exportResults');
      await page.waitForTimeout(3500); // give the export download time to finish
      await page.screenshot({path: 'screenshot.jpg', fullPage: true});
    }
  } finally {
    await browser.close();
  }
})().catch((error) => {
  // Only a real failure yields a non-zero exit code.
  console.error(error);
  process.exitCode = 1;
});
I see that you defined the function inside the loop but you do not call the function

How to store console output as a variable Puppeteer

I'm having an issue storing the console output in a variable. I've created a short demo (less than 20 lines of code) that, when executed, outputs "hello how are you" to my Node console; I'd like to store this text in a variable. Thank you.
const puppeteer = require('puppeteer');
// Opens a visible browser, loads Google and echoes the page's console messages to the Node console.
(async function main() {
const browser = await puppeteer.launch({
headless: false,
defaultViewport: null
});
const Page = await browser.newPage();
await Page.goto('https://www.google.com/');
// Forward the text of every console message emitted by the page to Node's console.
await Page.on('console', code => console.log(code.text()));
// Runs inside the page: logs "123" to the page's console.
await Page.evaluate(_ => {
var b = "123"
console.log(b);
});
})();
You could use a promise, which will be resolved on the event, and then await for that.
// Captures the text of the first console message the page emits and stores it in a variable.
(async function main() {
  const browser = await puppeteer.launch({
    headless: false,
    defaultViewport: null
  });
  const Page = await browser.newPage();
  await Page.goto('https://www.google.com/');

  // Settles with the text of the first console message emitted after this point.
  // The listener is registered before evaluate() runs so the message cannot be missed.
  const consoleLogPromise = new Promise((resolve) => {
    Page.once('console', (message) => resolve(message.text()));
  });

  // Runs inside the page and writes to the page's console.
  await Page.evaluate(() => {
    const b = "123";
    console.log(b);
  });

  const output = await consoleLogPromise;
  console.log(output);
  await browser.close();
})();

How to use multiple link in .goto(url) puppeteer?

const puppeteer = require("puppeteer");
// Loads a single chapter page and prints the text of its .box-chap element.
(async () => {
try {
const browser = await puppeteer.launch({ headless: true});
const page = await browser.newPage();
await page.goto('url/c-0');
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
// Read the element's text inside the page and return it to Node.
const content = await page.evaluate(element => element.textContent, element);
console.log(content + "chapter");
} catch (error) {
// NOTE(review): the error is discarded here, so any failure above is silent.
}
})();
Hi all, I currently want to loop through these URLs:
'url/c-0'
'url/c-1'
'url/c-2'
.....
Please suggest a solution. Thanks, all.
Just loop your job. You can use a for loop to go through all the chapters you want to crawl (provided your chapter URLs share the same format).
const puppeteer = require("puppeteer");
// Visits url/c-0 … url/c-10 in turn with a single page and prints each chapter's text.
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    const endOfChapterNumber = 10; // number of the last chapter (inclusive)
    // The counter must be declared with `let`: a `const` counter throws on the first `c++`.
    for (let c = 0; c <= endOfChapterNumber; c++) {
      const chapterUrl = 'url/c-' + c;
      await page.goto(chapterUrl);
      await page.waitForSelector('.box-chap');
      const element = await page.$(".box-chap");
      const content = await page.evaluate(element => element.textContent, element);
      console.log(`${content} chapter: ${c}`);
    }
  } catch (error) {
    // Surface failures instead of silently stopping the crawl.
    console.error(error);
  } finally {
    if (browser) {
      await browser.close();
    }
  }
})();

use same browser instance?

Hi I am trying to make a screenshot service
const puppeteer = require('puppeteer');
// Screenshot dimensions and the shared browser handle used by the screenshot jobs.
var resWidth = 1366;
var resHeight = 1000;
var browser;
// NOTE(review): this async arrow function is closed without a trailing (), so it is
// never invoked and `browser` is never assigned here.
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
and when I receive a job I try to do:
// For every job in `data`, opens a new page in the shared browser and saves a full-page screenshot.
data.forEach(function(d){
try {
console.log(d["url"]);
// The async function is started but not awaited, so the surrounding try/catch
// does not observe rejections from the awaited calls inside it.
(async () => {
var page = await browser.newPage();
await page.setViewport({width: resWidth, height: resHeight});
await page.goto(d["url"], {timeout: 90000, waitUntil: 'networkidle'});
await page.screenshot({path: './picdata/' + d['id'] + '.png' ,fullPage: true});
await page.close();
})();
} catch(e) {}
});
but I can't... here is the error:
(node:40596) UnhandledPromiseRejectionWarning: Unhandled promise rejection (rejection id: 7): TypeError: Cannot read property 'newPage' of undefined
I don't want to open a new browser for each screenshot, because launching a browser takes time and requires more memory.
what should I do?
The problem:
// The function expression below is defined but has no trailing (), so it is never called.
(async () => {
browser = await puppeteer.launch({ignoreHTTPSErrors: true});
});
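This code never gets executed. Why? Because the function is only defined and never invoked: without the trailing () it is not an immediately invoked function expression (a self-executing closure).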
More on closures, here.
That being said, simply invoking it won't work for your scenario either, because these are async tasks: the browser may not have finished launching by the time newPage() is called.
My try with your example:
'use strict';
const puppeteer = require('puppeteer');
const resWidth = 1366;
const resHeight = 1000;
let browser;

/**
 * Launches the single shared browser instance used for every screenshot.
 * @returns {Promise<void>} Resolves once `browser` has been assigned.
 */
async function launchBrowser() {
  browser = await puppeteer.launch({ headless: true }); // "{ headless: true }" ensures no browser window is opened.
}

/**
 * Takes a full-page screenshot of one job in its own page of the shared browser.
 * A failure is logged and does not affect the other jobs; the page is always closed.
 * @param {{url: string, id: (string|number)}} d - Job with the URL to capture and the id used for the file name.
 * @returns {Promise<void>}
 */
async function captureScreenshot(d) {
  let page;
  try {
    // Each job gets its own page so that concurrent jobs do not overwrite each other's handle.
    page = await browser.newPage();
    await page.setViewport({ width: resWidth, height: resHeight });
    // 'networkidle2' replaces the old 'networkidle' value, which current Puppeteer versions reject.
    await page.goto(d['url'], { timeout: 90000, waitUntil: 'networkidle2' });
    await page.screenshot({ path: './picdata/' + d['id'] + '.png', fullPage: true });
  } catch (e) {
    console.log(e);
  } finally {
    if (page) {
      await page.close();
    }
  }
}

/**
 * Launches the browser, runs all screenshot jobs in `data` and closes the browser
 * only after every job has finished.
 * @returns {Promise<void>}
 */
async function main() {
  await launchBrowser(); // wait until the browser has launched
  try {
    // forEach would not wait for the async callbacks; Promise.all does.
    await Promise.all(data.map((d) => captureScreenshot(d)));
  } finally {
    await browser.close(); // close browser finally.
  }
}

main().catch((e) => {
  console.log(e);
});
Pro Tip: Start using let, const instead of var.
There is a great article on this, here

Resources