Puppeteer does not recognize the link - node.js

I'm trying to grab some html in for expression, but somehow I'm getting error
Error: Evaluation failed: ReferenceError: link is not defined
at __puppeteer_evaluation_script__:8:29
at ExecutionContext.evaluateHandle (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:124:13)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at ExecutionContext.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\ExecutionContext.js:58:31)
at ExecutionContext.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at Frame.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\FrameManager.js:447:20)
at process._tickCallback (internal/process/next_tick.js:68:7)
-- ASYNC --
at Frame.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:144:27)
at Page.evaluate (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\Page.js:777:43)
at Page.<anonymous> (C:\Repositories\auto-grabber-server\node_modules\puppeteer\lib\helper.js:145:23)
at zrGrabber.StartGrabbingHtml (C:\Repositories\auto-grabber-server\grabbers\zr.grabber.js:52:40)
at process._tickCallback (internal/process/next_tick.js:68:7)
The link has been passed to the StartGrabbingHtml function, but then I'm getting the mentioned error. I suppose that something is wrong with the async stuff, but I can't tell what exactly.
const puppeteer = require("puppeteer");
let links = [];
const Mongo = require('./../db/mongo');
const zrLinks = [
"https://www.zr.ru/stories/consultant/optimalno/",
"https://www.zr.ru/news/avtomobili/",
"https://www.zr.ru/stories/prezentaciya-car/new/"
];
module.exports = class zrGrabber {
async startGrabbingLinks() {
try {
for (let i = 0; i < zrLinks.length; i++) {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(zrLinks[i], {
waitUntil: 'load',
timeout: 0
});
const result = await page.evaluate(() => {
const links = document.querySelectorAll('div.head > h2 > a')
return [...links].map(link => link.href);
});
await page.close();
await browser.close();
links = [...links, ...result];
}
const db = new Mongo();
for (let i = 0; i < links.length; i++) {
// if link already in database skip grabbing
const found = await db.findLink(links[i]);
if (found) {
continue;
}
// else grab and write link to database
await this.StartGrabbingHtml(links[i])
}
} catch (err) {
console.log(err)
}
}
async StartGrabbingHtml(link) {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto(link, {
waitUntil: 'load',
timeout: 0
});
const article = await page.evaluate(() => { // error throwing here
const date = document.querySelector('#storyDetailArticle > time').innerHTML;
const name = document.querySelector('#storyDetailArticle > h1').innerHTML;
const description = document.querySelector('#storyDetailArticle > div.stroy_announcement > h3').innerHTML;
const author = document.querySelector('#storyDetailArticle > div.announcement_author.story_author.no_preview > div').innerHTML;
const content = document.querySelector('#storyDetailArticle > div.stroy_content').innerHTML;
return {
source: link,
date: date,
name: name,
description: description,
author: author,
content: content
};
});
console.log(article)
const db = new Mongo();
await db.insertOne(article);
await page.close();
await browser.close();
} catch (err) {
console.log(err)
}
}
}
What I'm doing wrong here?

The script cannot access the variable link from inside the page.evaluate context.
You should pass it as an argument like this:
await page.evaluate(link => {
// ...
}, link);

Related

How to use Node.js crawler Web

I expect the product info to be printed as the items are displayed. However, the current code shows all loaded items, even the ones that are not shown yet.
How do I modify my code? Thank you.
// const request = require("request");
const cheerio = require("cheerio");
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false // 無外殼的 Chrome,有更佳的效能
});
const page = await browser.newPage();
await page.goto('https://www.balenciaga.com/en-us/women/shoes/sneakers');
await getData(page)
await scrollItem(page)
})();
const scrollItem = async (page) => {
pageHeight = await page.evaluate('document.body.scrollHeight')
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)',
await page.waitForFunction(`document.body.scrollHeight > ${pageHeight}`),
await getData(page)
)
}
const getData = async (page) => {
let body = await page.content()
let $ = await cheerio.load(body)
const data = []
const list = $(".l-productgrid__item .c-product__infos");
for (let i = 0; i < list.length; i++) {
const title = list.eq(i).find('.c-product__infos h2').text();
const price = list.eq(i).find('.c-product__infos p').text().trim();
data.push({ title, price });
}
data.forEach((res, i) => {
console.log(`${i+1} 名稱: ${res.title}, 價錢: ${res.price}`)
})
await scrollItem(page)
}
working code:
// define function which accepts body and cheerio as args
function extract(input, cheerio) {
// return object with extracted values
let $ = cheerio.load(input);
return $('.l-productgrid__item .c-product__infos').map(function() {
return {
header: $('h2', this).text().trim(),
price: $('p', this).text().trim()
}
}).toArray()
}
proof of work (screenshot)

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the urls that get scraped from the scrapeProductPage function. But before looping, it needs to log in so that it can obtain the prices. The prices are only displayed to logged in users. Instead of looping through the urls it just returns the scraped data from one page. The error I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
async function scrapeProductPage() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
$("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
let url = $(element).attr("href");
url = "https\://www.example.com/" + url;
const scrapeResult = { url };
scrapeResults.push(scrapeResult);
});
return scrapeResults;
} catch (err) {
console.error(err);
}
}
async function scrapeDescription(productsWithImages) {
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
await page.click('#btnLogin');
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
console.error(error);
}
})
);
}
async function saveDataToCsv(data) {
const csv = new ObjectsToCsv(data);
console.log(csv);
}
async function scrapeWona() {
const productsWithImages = await scrapeProductPage();
const wonaFullData = await scrapeDescription(productsWithImages);
await saveDataToCsv(productsWithImages);
}
scrapeWona();
The reason you're getting the warning is your call to process.setMaxListeners(0).
It indicates you have a memory leak somewhere in the code.
You can take a look at the documentation here also: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer from here: node.js - request - How to "emitter.setMaxListeners()"?

Puppeteer wrong result with evaluate() & exposeFunction()

I ran the following and it appears to gather a large number of links, however on actual inspection of the site with collectLinks1 I get all valid links, but with collectLinks2 I got 59 iterations of http://pieroxy.net/blog/2014/11/18/[
I'm new to Puppeteer and I can't find out why with collectLinks2 I don't get the links.
const { parse, resolve } = require('url');
const trim = require('lodash/trim');
const startsWith = require('lodash/startsWith');
const includes = require('lodash/includes');
// https://github.com/GoogleChrome/puppeteer
const puppeteer = require('puppeteer');
// https://github.com/gwuhaolin/chrome-finder
const findChrome = require('chrome-finder');
function resolveUrl(url, baseUrl) {
url = trim(url);
if (!url) return null;
if (startsWith(url, '#')) return null;
const { protocol } = parse(url);
if (includes(['http:', 'https:'], protocol)) {
return url.split('#')[0];
} if (!protocol) {
return resolve(baseUrl, url).split('#')[0];
}
return null;
}
async function collectLinks1(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
const assetUrls = await htmlPage.$$eval('a[href]', assetLinks => assetLinks.map(link => link.href));
assetUrls.forEach(link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
return links;
}
async function collectLinks2(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
await htmlPage.exposeFunction('pushToLinks', link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
await htmlPage.evaluate(() => {
function findLinks(document) {
document.querySelectorAll('a[href]')
.forEach(link => {
window.pushToLinks(link.href);
});
}
findLinks(window.document);
});
return links;
}
const crawl = async url => {
try {
console.log(`Crawling ${url}`);
const browser = await puppeteer.launch({
headless: false,
executablePath: findChrome(),
});
const page = await browser.newPage();
await page.goto(url);
// OK
const links1 = await collectLinks1(page);
links1.forEach(link => { console.log(link); });
// KO
const links2 = await collectLinks2(page);
links2.forEach(link => { console.log(link); });
await browser.close();
} catch (err) {
console.log(err);
}
};
crawl('http://pieroxy.net/blog/2014/11/18/user_agent_detection_in_java.html');
You need to await the function defined via page.exposeFunction as it returns a Promise. As you are only calling the function but not awaiting its result, your page.evaluate call will resolve before your script finished executing.
Solution
Instead of the forEach, you should use a loop to iterate over all the items and communicate them to the page one after another.
async function collectLinks2(htmlPage) {
// ...
await htmlPage.evaluate(async () => {
async function findLinks(document) {
for (const link of document.querySelectorAll('a[href]')) {
await window.pushToLinks(link.href);
}
}
await findLinks(window.document);
});
return links;
}

puppeteer : cant log in and loop through urls

Hi guys, I want to log in to a website and, once authenticated, loop through a given set of URLs and scrape data. What I intend to do can be described by this example; however, I get an unhandled promise rejection.
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
console.log(title);
}
catch(e) {
console.log(e)
}
return title
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
getTitle(page, url)
}
await browser.close();
})();
There are multiple issues in this example.
You should await the call to the function getTitle; you're awaiting inside the function, but you have to await the call to the function too.
You should surround getTitle with a try and catch block and check inside the function if there's a title to return (e.g. the title for google is null).
const puppeteer = require("puppeteer");
list = [
"https://www.facebook.com/",
"https://www.google.com/",
"https://www.zocdoc.com/"
];
const getTitle = async (p, url) => {
try{
await p.goto(url);
const title = await p.title();
if(title){
return title
}
}
catch(e) {
throw(e)
console.log(e)
}
};
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
console.log(this)
for (var url of list) {
try{
console.log(await getTitle(page, url))
}
catch(e ){
console.log('No title')
}
}
await browser.close();
})();

How to return value from async/await function?

Using puppeteer to collect data from 2 different webpages into arrays for later comparison. However the program does not wait for the returned array before carrying forward.
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
};
(async function main(){
const returnedData = await go();
console.log(returnedData.length);
})();
The returned data.length is 0. I'm new to Node.js and async programming structure. I think it is because .length is logged before the data is returned?
how do I return the data in a way where can manipulate it and complete my comparisons?
I try to not use page.$$ in such cases. Instead I use document.querySelectorAll and map thru the elements and extract the text.
Here is the modified code:
const getTdData = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("http://example.com");
return page.evaluate(() => {
// get all td elements
const tdList = [...document.querySelectorAll("td")];
return tdList.map(element => ({ content: element.textContent }));
});
} catch (e) {
console.log(e);
}
};
(async function main() {
const returnedData = await getTdData();
console.log(returnedData.length);
})();
First of all, you are missing an apostrophe in your page.$$() function. You should change this to:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
const go = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++) {
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
if (tdcontent.length > 5) {
data[i] = {
content: tdcontent,
};
}
}
return data;
} catch (error) {
console.log(error);
}
};
(async function main() {
const returnedData = await go();
console.log(returnedData.length);
})();
If you are are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');

Resources