How to use Node.js crawler Web - node.js

I expect the product info to be printed only when an item is actually displayed. However, the current code prints every item that has been loaded, even if it is not shown yet.
How do I modify my code? Thank you.
// const request = require("request");
const cheerio = require("cheerio");
const puppeteer = require('puppeteer');
// Entry point: launches a browser, opens the sneakers listing page and starts scraping it.
(async () => {
const browser = await puppeteer.launch({
headless: false // original comment said "headless Chrome, better performance" — NOTE(review): the value passed here is false
});
const page = await browser.newPage();
await page.goto('https://www.balenciaga.com/en-us/women/shoes/sneakers');
// getData() logs the products found in the current markup and then calls scrollItem() itself.
await getData(page)
// NOTE(review): getData() and scrollItem() call each other with no stopping condition,
// so this line is only reached if that chain ever returns.
await scrollItem(page)
// NOTE(review): browser.close() is never called in this block.
})();
// Appears intended to scroll to the bottom of the page, wait for the document to grow and scrape again.
// NOTE(review): getData() ends by calling scrollItem() again, so the two functions keep calling each other.
const scrollItem = async (page) => {
// NOTE(review): pageHeight is assigned without a const/let declaration.
pageHeight = await page.evaluate('document.body.scrollHeight')
// NOTE(review): the two awaited expressions below are written as extra arguments of page.evaluate().
// Arguments are evaluated before the call itself, so waitForFunction() and getData() both have to
// finish before the scrollTo string is handed to page.evaluate().
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)',
await page.waitForFunction(`document.body.scrollHeight > ${pageHeight}`),
await getData(page)
)
}
// Loads the page's current HTML into cheerio and logs the title and price of every product node found.
const getData = async (page) => {
let body = await page.content()
let $ = await cheerio.load(body)
const data = []
// The selector only looks at the markup; nothing here checks whether an item has been scrolled into view.
const list = $(".l-productgrid__item .c-product__infos");
for (let i = 0; i < list.length; i++) {
// NOTE(review): list already holds the .c-product__infos nodes, so these find() calls search for a
// nested .c-product__infos inside each of them — confirm this matches the page markup.
const title = list.eq(i).find('.c-product__infos h2').text();
const price = list.eq(i).find('.c-product__infos p').text().trim();
data.push({ title, price });
}
// The log labels are Chinese: 名稱 = name, 價錢 = price.
data.forEach((res, i) => {
console.log(`${i+1} 名稱: ${res.title}, 價錢: ${res.price}`)
})
// NOTE(review): unconditional call back into scrollItem(), which in turn calls getData() again.
await scrollItem(page)
}

working code:
/**
 * Parse product-grid markup and collect the heading and price text of every product.
 *
 * @param {string} input - HTML markup to parse.
 * @param {object} cheerio - the cheerio module (anything exposing a compatible `load`).
 * @returns {Array<{header: string, price: string}>} one record per matched product info node.
 */
function extract(input, cheerio) {
  const $ = cheerio.load(input);
  const infoNodes = $('.l-productgrid__item .c-product__infos');

  // cheerio hands each matched node to the callback as its second argument.
  const toRecord = (_index, node) => {
    const header = $('h2', node).text().trim();
    const price = $('p', node).text().trim();
    return { header, price };
  };

  return infoNodes.map(toRecord).toArray();
}
proof of work (screenshot)

Related

Speed up scrappers

I have been scraping for some time now, and recently started using Node and Puppeteer for some projects. I built this scraper to collect Telegram links from a crypto coin marketplace site. It is rather slow, and I don't really know where to start to figure out how to speed it up. So my question is: how do I learn to speed up my web scrapers without losing any of the information that is collected?
Here is what I have now. It tries to scrape the Telegram links from about 10,000 different coin pages and then saves those links to a CSV file.
const puppeteer = require('puppeteer');
const ObjectsToCsv = require('objects-to-csv');
// Entry point: walks listing pages 50-100, collects the coin links found in each
// table, visits every coin page to harvest Telegram links and finally writes the
// result to a CSV file.
(async () => {
  const browser = await puppeteer.launch({
    headless: true,
    defaultViewport: null,
    args: ['--start-maximized'],
  });
  try {
    const page = await browser.newPage();
    // const baseUrl = "https://coinmarketcap.com/"
    const totalTelegramLinks = [];
    for (let i = 50; i < 101; i++) {
      // Navigation stays best-effort, but a failure is now reported instead of vanishing.
      await page
        .goto(`https://coinmarketcap.com/?page=${i}`, { waitUntil: 'networkidle2' })
        .catch((err) => console.warn(`[-] Could not finish loading page ${i}:`, err));
      console.log(`[+] Scraping Page ${i}`);
      await autoScroll(page);
      // Relative hrefs of every coin link in the listing table.
      const allLinks = await page.evaluate(() => {
        const aTags = Array.from(document.querySelectorAll('table.cmc-table tbody tr td div.sc-16r8icm-0.escjiH a.cmc-link'));
        return aTags.map((a) => a.getAttribute('href'));
      });
      // NOTE(review): every iteration writes a full-page screenshot to the same file.
      await page.screenshot({
        path: 'yoursite.png',
        fullPage: true,
      });
      console.log(allLinks.length);
      totalTelegramLinks.push(...(await clickCoinLinks(page, allLinks)));
    }
    // Awaited so a failed write surfaces here and the bell only rings once the file is saved.
    await saveToFile(totalTelegramLinks);
    console.log('\u0007');
  } finally {
    // Close the browser even when one of the steps above throws.
    await browser.close();
  }
})();
// Matches http(s) links that point at t.me or telegram.me.
// Written as a regex literal so the dots are real escapes: inside the former
// string pattern '\.' collapsed to '.', which matched any character
// (e.g. "https://txme/...").
const telegramRegex = /(?:http|https):\/\/(?:t\.me|telegram\.me)\/.*/;
// Origin that is prepended to the relative coin links scraped from the listing table.
const baseUrl = "https://coinmarketcap.com"
/**
 * Scroll the page down in 100px steps, one step every 100ms, until the
 * accumulated distance reaches the document height minus the viewport height.
 *
 * @param page - object whose `evaluate` runs the given function in the page.
 * @returns {Promise<void>} settles once the scrolling loop has stopped.
 */
async function autoScroll(page) {
  // Self-contained on purpose: it is handed to page.evaluate and must not
  // reference anything outside its own body.
  const scrollToBottom = () =>
    new Promise((done) => {
      const stepPx = 100;
      let scrolledPx = 0;
      const ticker = setInterval(() => {
        const docHeight = document.body.scrollHeight;
        window.scrollBy(0, stepPx);
        scrolledPx += stepPx;
        if (scrolledPx < docHeight - window.innerHeight) {
          return;
        }
        clearInterval(ticker);
        done();
      }, 100);
    });

  await page.evaluate(scrollToBottom);
}
/**
 * Visit every coin page and collect the Telegram links found on it.
 *
 * Plain anchor buttons are checked first; the hover dropdown is only inspected
 * when those yield nothing.
 *
 * @param page - Puppeteer page that is navigated to each coin URL in turn.
 * @param {string[]} links - coin paths relative to `baseUrl`.
 * @returns {Promise<Array>} concatenated results of linkHandler / dropdownBtnHandler.
 */
async function clickCoinLinks(page, links) {
  const totalLinks = [];
  for (const url of links) {
    try {
      await page.goto(`${baseUrl}${url}`, { waitUntil: 'networkidle2' });
    } catch (err) {
      // Still best-effort (the page is inspected anyway, as before), but the
      // failure is no longer invisible.
      console.warn(`[-] Navigation to ${url} did not complete:`, err);
    }
    // The former per-page `page.title()` call and `navigations` counter were
    // never read, so they are gone: one protocol round trip less per coin.
    const simpleLinkBtns = await page.$$('a.link-button');
    let telegramLinks = await linkHandler(simpleLinkBtns, page);
    if (!telegramLinks.length) {
      const hoverLinkBtns = await page.$$('button.link-button');
      telegramLinks = await dropdownBtnHandler(hoverLinkBtns, page);
    }
    totalLinks.push(...telegramLinks);
  }
  return totalLinks;
}
/**
 * Read the `href` of every element handle and pass the list to testLinks().
 *
 * @param eleHandles - iterable of element handles exposing `getProperty`.
 * @param page - forwarded unchanged to testLinks().
 * @returns whatever testLinks() resolves to for the collected hrefs.
 */
const linkHandler = async (eleHandles, page) => {
  const hrefs = [];
  for (const handle of eleHandles) {
    const hrefProp = await handle.getProperty('href');
    hrefs.push(await hrefProp.jsonValue());
  }
  return testLinks(hrefs, page);
};
/**
 * Hover every button labelled "Chat", gather the hrefs of the dropdown items
 * that are then present, and run them through testLinks().
 *
 * @param eleHandles - iterable of button handles (`getProperty`, `hover`).
 * @param page - page queried for `li > a.dropdownItem`; also forwarded to testLinks().
 * @returns the result of the last testLinks() call, or [] when no "Chat" button was found.
 */
async function dropdownBtnHandler(eleHandles, page) {
  // Shared across all buttons: hrefs keep accumulating from one "Chat" button to the next.
  const collectedHrefs = [];
  let pendingResult;

  for (const button of eleHandles) {
    const labelProp = await button.getProperty('innerText');
    const label = await labelProp.jsonValue();
    if (label !== 'Chat') {
      continue;
    }

    await button.hover();
    const dropdownItems = await page.$$('li > a.dropdownItem');
    for (const item of dropdownItems) {
      const hrefProp = await item.getProperty('href');
      collectedHrefs.push(await hrefProp.jsonValue());
    }
    // Deliberately not awaited here; the promise is handed back to the caller below.
    pendingResult = testLinks(collectedHrefs, page);
  }

  return pendingResult || [];
}
/**
 * Keep only the links that match `telegramRegex` and label them with the coin
 * name taken from the second-to-last "/"-separated piece of the page URL.
 *
 * @param {string[]} links - candidate URLs.
 * @param page - object whose `url()` returns the current page address.
 * @returns {Promise<Array<{coin: string, telegram_links: string[]}>>}
 *   a one-element array when at least one link matched, otherwise [].
 */
const testLinks = async (links, page) => {
  const coin = await page.url().split('/').at(-2);
  const matches = links.filter((link) => telegramRegex.test(link));
  if (matches.length === 0) {
    return [];
  }
  return [{ coin, telegram_links: matches }];
};
/**
 * Write the collected records to ./telegram_links.csv.
 *
 * @param {Array<object>} links - records handed to ObjectsToCsv.
 * @returns {Promise<void>} settles once toDisk() has finished.
 */
const saveToFile = async (links) => {
  await new ObjectsToCsv(links).toDisk('./telegram_links.csv');
};

Requests for multiple pages with puppeteer

I am trying to get information (company names and emails) from many sites (links stored in an array) whose content is loaded dynamically, using Puppeteer. I use a "for" loop to iterate over the array of links, call page.goto for each site, wait until the site is loaded, wait several more seconds for the dynamic content, and then start extracting data. However, only the first and last requests complete (their promises resolve); the other promises do not return the dynamic content. What should I do to fix that? Thanks.
let puppeteer = require('puppeteer');
// Visits `${url}${name}` for each entry of arrayNames, waits 7 seconds and logs the e-mail markup read from the page.
(async() => {
const browser = await puppeteer.launch();
let page = await browser.newPage();
const url = 'https://abcdsite.com/';
let arrayNames = ['first','second','third','abcd'];
// The loop header has no increment; i is only advanced inside the last .then() below.
for(let i=0;i<await arrayNames.length;){
let nameUrl = await arrayNames[i];
if (i<4){
let temp1;
console.log(`begin for ${nameUrl}`);
await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' })
.then(()=>{
return new Promise(res=>{
//wait content dynamic load
setTimeout(()=>{
// NOTE(review): page.evaluate() is not awaited, so temp1 holds the promise it returns, not the e-mail string.
temp1 = page.evaluate(() => {
return new Promise(resolve => { // <-- return the data to node.js from browser
// name is read but never used afterwards.
let name = document.querySelector('h1').innerHTML;
let email = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0]
.children[2].children[0].children[0].innerHTML;
resolve(email);
});
});
res(temp1);
},7000);
})
})
.then((res)=>{
i++;
// NOTE(review): the resolved value arrives as `res`, but the log prints temp1 (the promise object) instead.
console.log(`https://abcdsite.com/${nameUrl}`,temp1);
});
}
else{
break
}
}
// NOTE(review): browser.close() is never called in this block.
})();
I think this helps you.
1) make an async function to request and parse your data
2) create an array of parallel tasks.
let puppeteer = require('puppeteer');
/**
 * Open `${url}${nameUrl}` in the given page, give the dynamic content time to
 * appear and read the contact e-mail markup out of the DOM.
 *
 * @param page - Puppeteer page used for the navigation.
 * @param {string} url - base URL, including the trailing slash.
 * @param {string} nameUrl - path segment appended to `url`.
 * @param {number} [delayMs=7000] - time to wait after the `load` event, in milliseconds.
 * @returns {Promise<string>} innerHTML of the e-mail node.
 */
async function makeRequest(page, url, nameUrl, delayMs = 7000) {
  await page.goto(`${url}${nameUrl}`, { waitUntil: 'load' });

  // A bare setTimeout callback can neither `await` (it is not async) nor hand
  // its result back to the caller, so the delay is wrapped in an awaited promise.
  await new Promise((resolve) => setTimeout(resolve, delayMs));

  return page.evaluate(() => {
    const card = document.getElementsByClassName('sidebar-views-contacts h-card vcard')[0];
    return card.children[2].children[0].children[0].innerHTML;
  });
}
// Entry point: requests every name in parallel and logs the result for each one.
(async () => {
  const browser = await puppeteer.launch();
  const url = 'https://abcdsite.com/';
  const arrayNames = ['first', 'second', 'third', 'abcd'];
  try {
    // One tab per task: navigations that run at the same time on a single
    // shared page would interrupt each other.
    const tasks = arrayNames.map(async (nameUrl) => {
      const page = await browser.newPage();
      try {
        return await makeRequest(page, url, nameUrl);
      } finally {
        await page.close();
      }
    });
    // Awaited so a rejected task surfaces here instead of being left unhandled.
    const res = await Promise.all(tasks);
    arrayNames.forEach((nameUrl, i) => {
      console.log(`https://abcdsite.com/${nameUrl}`, res[i]);
    });
  } finally {
    await browser.close();
  }
})();
Series solution
For more information read this.
// Sequential variant: each request finishes before the next one is started.
for (const nameUrl of arrayNames) {
  const temp = await makeRequest(page, url, nameUrl);
  console.log(`https://abcdsite.com/${nameUrl}`, temp);
}
puppeteer's page.goto function has multiple parameters you can use to ensure that the page is fully loaded. See the documentation here.
In addition, you can use the page.waitFor method to wait for a few seconds. See documentation here.
Here you have a simple example that I think may work for you:
const puppeteer = require('puppeteer')
const url = 'https://stackoverflow.com/'
const arrayNames = ['tags', 'users', 'jobs', 'questions'];
// Visits every section listed in arrayNames, one after another, and collects
// the <h1> and <title> text of each page into `data`, keyed by full URL.
(async () => {
  const browser = await puppeteer.launch()
  try {
    const page = await browser.newPage()
    const data = {}
    for (const nameUrl of arrayNames) {
      const fullUrl = `${url}${nameUrl}`
      console.log(`begin for ${fullUrl}`)
      await page.goto(fullUrl, { waitUntil: 'networkidle0' }) // check networkidle0 parameter and others here: https://pptr.dev/#?product=Puppeteer&version=v2.1.1&show=api-pagegotourl-options
      // NOTE(review): page.waitFor belongs to the Puppeteer version linked above — confirm it exists in the installed release.
      await page.waitFor(2000) // wait 2 seconds to allow a full login. Optional
      const pageData = await page.evaluate(() => {
        const name = document.querySelector('h1').innerText
        const pageTitle = document.querySelector('title').innerText
        // get whatever data you need to get from the page.
        return { name: name, title: pageTitle }
      })
      console.log('\t Data from page: ', pageData)
      data[fullUrl] = pageData
    }
    console.log(data)
  } finally {
    // Close the launched browser whether or not the loop above succeeded.
    await browser.close()
  }
})()
This does not run all sites in parallel, but you can then play around with the example.
Instead of 'awaiting' the await page.evaluate part, you could get all the promises in an array and then use await Promise.all([listOfPromises])

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the urls that get scraped from the scrapeProductPage function. But before looping, it needs to log in so that it can obtain the prices. The prices are only displayed to logged in users. Instead of looping through the urls it just returns the scraped data from one page. The error I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
/**
 * Download the listing page and append one `{ url }` record per product link
 * to the module-level `scrapeResults` array.
 *
 * @returns {Promise<Array<{url: string}>>} the `scrapeResults` array. It is
 *   returned even when the download or parsing fails (the error is logged), so
 *   callers can always iterate over the result.
 */
async function scrapeProductPage() {
  try {
    const htmlResult = await request.get(url);
    const $ = cheerio.load(htmlResult);
    $("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
      // Named `href` so it no longer shadows the module-level `url`.
      const href = $(element).attr("href");
      scrapeResults.push({ url: `https://www.example.com/${href}` });
    });
  } catch (err) {
    console.error(err);
  }
  // Previously the error path fell through and returned undefined, which made
  // the caller's `.map(...)` throw.
  return scrapeResults;
}
// Logs in through the login form, then visits every product URL and fills in
// main_img, name and price on each job object.
// Resolves to an array with one entry per job; an entry is undefined when that job's scrape threw.
async function scrapeDescription(productsWithImages) {
// NOTE(review): passes 0 as the listener limit of `process` — presumably meant to silence MaxListenersExceededWarning; confirm.
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
// NOTE(review): nothing waits for the login to finish after this click.
await page.click('#btnLogin');
// NOTE(review): map() starts every async callback right away and all of them use the same `page`,
// so the page.goto() calls overlap on a single tab — presumably why only one product's data comes back.
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
// The error is only logged; the callback then returns undefined for this job.
console.error(error);
}
})
);
// NOTE(review): browser.close() is never called in this function.
}
/**
 * Wrap the rows in an ObjectsToCsv instance and print that instance.
 * Nothing is written to disk here.
 *
 * @param {Array<object>} data - rows handed to ObjectsToCsv.
 */
async function saveDataToCsv(data) {
  console.log(new ObjectsToCsv(data));
}
/**
 * Run the pipeline: collect the product URLs, enrich them on their detail
 * pages, then hand the product list to saveDataToCsv().
 */
async function scrapeWona() {
  const productsWithImages = await scrapeProductPage();
  // The resolved value is not used; the same product list is what gets saved below.
  await scrapeDescription(productsWithImages);
  await saveDataToCsv(productsWithImages);
}
scrapeWona();
The reason you're getting the warning is because of process.setMaxListeners(0)
Indicates you have a memory leak somewhere in the code.
You can take a look at the documentation here also: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer from here: node.js - request - How to "emitter.setMaxListeners()"?

How to use multiple link in .goto(url) puppeteer?

const puppeteer = require("puppeteer");
// Opens chapter 0 and prints the text content of its .box-chap element.
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    await page.goto('url/c-0');
    await page.waitForSelector('.box-chap');
    const element = await page.$(".box-chap");
    const content = await page.evaluate(element => element.textContent, element);
    console.log(content + "chapter");
  } catch (error) {
    // Report the failure instead of swallowing it silently.
    console.error(error);
  } finally {
    // Close the browser whether or not the scrape succeeded.
    if (browser) {
      await browser.close();
    }
  }
})();
Hi all, I currently want to loop over the following URLs:
url/c-0
url/c-1
url/c-2
.....
Please suggest a solution. Thanks, all.
Just loop your job. You could create a forloop to loop all chapters which you want to crawl (if your chapter urls have the same format).
const puppeteer = require("puppeteer");
// Visits url/c-0 … url/c-<endOfChapterNumber> in order and prints the text
// content of each chapter's .box-chap element.
(async () => {
  let browser;
  try {
    browser = await puppeteer.launch({ headless: true });
    const page = await browser.newPage();
    const endOfChapterNumber = 10; // index of the last chapter to fetch
    // `let`, not `const`: c++ reassigns the counter, which throws a TypeError on a const binding.
    for (let c = 0; c <= endOfChapterNumber; c++) {
      const chapterUrl = 'url/c-' + c;
      await page.goto(chapterUrl);
      await page.waitForSelector('.box-chap');
      const element = await page.$(".box-chap");
      const content = await page.evaluate(element => element.textContent, element);
      console.log(content + " chapter: " + c);
    }
  } catch (error) {
    // Report the failure instead of swallowing it silently.
    console.error(error);
  } finally {
    // Close the browser whether or not the loop completed.
    if (browser) {
      await browser.close();
    }
  }
})();

How to return value from async/await function?

Using puppeteer to collect data from 2 different webpages into arrays for later comparison. However the program does not wait for the returned array before carrying forward.
// Opens the page, reads the innerText of every <td> and returns the cells longer than 5 characters.
// Returns undefined when anything inside the try block throws (the error is only logged).
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
// NOTE(review): the URL has no scheme (http:// or https://) — confirm page.goto() accepts it.
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
// Stored at the cell's own index, so skipped cells leave holes and data.length is not the number of matches.
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
// NOTE(review): browser.close() is never called in this function.
};
// Runs go() and prints how many slots the returned array has.
(async function main() {
  const cells = await go();
  console.log(cells.length);
})();
The returned data.length is 0. I am new to Node.js and to the async programming structure. I think this is because .length is logged before the data is returned?
How do I return the data in a way that lets me manipulate it and complete my comparisons?
I try to not use page.$$ in such cases. Instead I use document.querySelectorAll and map thru the elements and extract the text.
Here is the modified code:
/**
 * Open the page and collect the text content of every <td> element.
 *
 * @returns {Promise<Array<{content: string}>|undefined>} one record per cell,
 *   or undefined when launching, navigating or evaluating fails (the error is logged).
 */
const getTdData = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto("http://example.com");
    // `return await` keeps a rejected evaluate() inside this try block, so it
    // reaches the catch below instead of escaping to the caller.
    return await page.evaluate(() => {
      // get all td elements
      const tdList = [...document.querySelectorAll("td")];
      return tdList.map(element => ({ content: element.textContent }));
    });
  } catch (e) {
    console.log(e);
  } finally {
    // Close the browser on every path.
    if (browser) {
      await browser.close();
    }
  }
};
// Runs getTdData() and prints the number of records it produced.
(async function main() {
  const rows = await getTdData();
  console.log(rows.length);
})();
First of all, you are missing an apostrophe in your page.$$() function. You should change this to:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
/**
 * Open the page, read the innerText of every <td> and keep the cells whose
 * text is longer than 5 characters.
 *
 * @returns {Promise<Array<{content: string}>|undefined>} records stored at the
 *   index of their cell (skipped cells leave holes), or undefined when a step
 *   fails (the error is logged).
 */
const go = async () => {
  let browser;
  try {
    browser = await puppeteer.launch();
    const page = await browser.newPage();
    // NOTE(review): the URL has no scheme (http:// or https://) — confirm page.goto() accepts it.
    await page.goto('www.webpage.com');
    const tds = await page.$$('td');
    const data = [];
    for (let i = 0; i < tds.length; i++) {
      const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
      if (tdcontent.length > 5) {
        data[i] = {
          content: tdcontent,
        };
      }
    }
    return data;
  } catch (error) {
    console.log(error);
  } finally {
    // Close the browser on every path.
    if (browser) {
      await browser.close();
    }
  }
};
// Runs go() and prints how many slots the returned array has.
(async function main() {
  const cells = await go();
  console.log(cells.length);
})();
If you are are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');

Resources