How do I scrape images from Udemy using Node.js and Puppeteer?

This is my code - scraping course titles works OK, but I have a problem with the images:
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')
function sleep(ms)
{
return new Promise(resolve => setTimeout(resolve, ms));
}
async function start()
{
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL");
await sleep(5000);
const names = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
})
const images = await page.evaluate(() => {
return Array.from(
document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny")
).map((image) => image.getAttribute(`src`));
});
let m = ";";
for (let i = 0; i < names.length; i++)
{
names[i] = i+m+names[i]+m+images[i]
}
await fs.writeFile("courses.txt", names.join("\r\n"))
await page.screenshot({ path: "udemy.png", fullPage: true });
await browser.close();
}
start()
Now it returns null instead of the image URLs, and if I change src to srcset nothing changes.
The page I want to scrape the images from is https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL
On the screenshot that this script takes I can see that the course icons are blacked out. I can scrape the images that are visible on the screenshot, but not those that are blacked out.

OK, I found the answer - I added a setViewport call and img at the end of the querySelectorAll selector:
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth')
puppeteer.use(StealthPlugin());
const fs = require('fs/promises')
function sleep(ms)
{
return new Promise(resolve => setTimeout(resolve, ms));
}
async function start()
{
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.goto("https://www.udemy.com/pl/courses/development/web-development/?lang=pl&sort=popularity&persist_locale=&locale=pl_PL", { "waitUntil": "networkidle0" });
await sleep(1000);
const bodyWidth = await page.evaluate(() => document.body.scrollWidth);
const bodyHeight = await page.evaluate(() => document.body.scrollHeight);
await page.setViewport({ width: bodyWidth, height: bodyHeight });
await sleep(1000);
const names = await page.evaluate(() => {
return Array.from(document.querySelectorAll(".course-list--container--3zXPS div.udlite-focus-visible-target.udlite-heading-md.course-card--course-title--vVEjC")).map(x => x.textContent)
})
const images = await page.evaluate(() => {
return Array.from(
document.querySelectorAll(".course-list--container--3zXPS div.course-card--image-wrapper--1F9ny img")
).map((image) => image.getAttribute(`src`));
});
let m = ";";
for (let i = 0; i < names.length; i++)
{
names[i] = i+m+names[i]+m+images[i]
}
await fs.writeFile("courses.txt", names.join("\r\n"))
await page.screenshot({ path: "udemy.png", fullPage: true });
await browser.close();
}
start()
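An alternative that also works for lazy-loaded thumbnails (a sketch of mine, not part of the original answer) is to scroll through the list inside start() instead of resizing the viewport, so every <img> receives a real src before it is read. The selector and the data-src fallback below are assumptions based on the markup above - Udemy's generated class names change over time:
// Inside start(), after page.goto()/sleep(): scroll in steps so lazy images load.
await page.evaluate(async () => {
  for (let y = 0; y < document.body.scrollHeight; y += 500) {
    window.scrollTo(0, y);
    await new Promise(resolve => setTimeout(resolve, 200));
  }
});
// Read src, falling back to data-src in case an image has not been swapped in yet.
const imageUrls = await page.evaluate(() => {
  return Array.from(document.querySelectorAll(".course-list--container--3zXPS img"))
    .map(img => img.getAttribute("src") || img.getAttribute("data-src"));
});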

Related

Speed up scrapers

I have been scraping for some time now, and recently started using Node and Puppeteer for some projects. I built this scraper to collect Telegram links from this crypto coin marketplace site, but it's kind of slow and I don't really know where to start to figure out how to speed it up. So my question is: how do I speed up my web scrapers without losing any of the information that is collected?
Here is what I have now: it tries to scrape the Telegram links from about 10,000 different coin pages, then saves those links to a CSV.
const puppeteer = require('puppeteer');
const ObjectsToCsv = require('objects-to-csv');
(async () => {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--start-maximized']
});
const page = await browser.newPage();
// const baseUrl = "https://coinmarketcap.com/"
let totalTelegramLinks = []
for (let i = 50; i < 101;i++){
await page.goto(`https://coinmarketcap.com/?page=${i}`, {waitUntil : 'networkidle2' }).catch(e => void 0);
console.log(`[+] Scraping Page ${i}`);
await autoScroll(page);
let allLinks = []
const grabedTableLinks = await page.evaluate(() => {
const aTags = Array.from(document.querySelectorAll('table.cmc-table tbody tr td div.sc-16r8icm-0.escjiH a.cmc-link'))
return aTags.map(a=>a.getAttribute('href'))
})
// allLinks.push([...new Set([...grabedTableLinks, ...allLinks])])
allLinks.push(...grabedTableLinks)
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// console.log(allLinks);
console.log(allLinks.length);
// const await clickCoinLinks(page, allLinks)
totalTelegramLinks.push(...(await clickCoinLinks(page, allLinks)))
}
saveToFile(totalTelegramLinks)
console.log('\u0007')
await browser.close();
})();
const telegramRegex = /(?:http|https):\/\/(?:t\.me|telegram\.me)\/.*/ // regex literal, so the dots in t.me and telegram.me stay escaped
const baseUrl = "https://coinmarketcap.com"
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
async function clickCoinLinks(page, links){
let navigations = 0
let totalLinks = []
for (const url of links){
await page.goto(`${baseUrl}${url}`,{waitUntil : 'networkidle2' }).catch(e => void 0)
navigations++
const title = await page.title()
// console.log('---------')
// console.log(title)
const simpleLinkBtns = await page.$$('a.link-button')
let telegramLinks = await linkHandler(simpleLinkBtns, page)
if (telegramLinks.length){
totalLinks.push(...telegramLinks)
// telegramLinks.forEach(link => console.log(link))
}else{
// console.log('[-] No Immediate Link');
const hoverLinkBtns = await page.$$('button.link-button')
telegramLinks = await dropdownBtnHandler(hoverLinkBtns, page)
// console.log('Testing for dropdown link');
if (telegramLinks.length) totalLinks.push(...telegramLinks);
// telegramLinks ? telegramLinks.forEach(link => console.log(link)) : console.log('No dropdown Link either')
}
}
// console.log(totalLinks);
return totalLinks
}
const linkHandler = async (eleHandles, page)=>{
let linkUrls = []
for (const aTag of eleHandles){
linkUrls.push(await (await aTag.getProperty('href')).jsonValue())
}
const telegramLink = await testLinks(linkUrls, page)
return telegramLink
}
async function dropdownBtnHandler(eleHandles, page){
let linkUrls = []
let telegramLink
for (const btn of eleHandles){
const btnText = await (await btn.getProperty('innerText')).jsonValue()
if(btnText == 'Chat'){
await btn.hover()
const dropdownLinks = await page.$$('li > a.dropdownItem')
for (const aTag of dropdownLinks){
const hrefVal = await (await aTag.getProperty('href')).jsonValue();
linkUrls.push(hrefVal)
}
telegramLink = await testLinks(linkUrls, page)
}
}
return telegramLink ? telegramLink : []
}
const testLinks = async (links, page) =>{
const coin = page.url().split('/').at(-2) // page.url() is synchronous, no await needed
let telegramLinks = []
let coinLinks = []
links.forEach(link => {
if (telegramRegex.test(link)){
coinLinks.push(link)
}
})
// console.log(telegramLinks);
if(coinLinks.length){
const linkObj = {}
linkObj['coin'] = coin
linkObj['telegram_links'] = coinLinks
telegramLinks.push(linkObj)
}
return telegramLinks
}
const saveToFile = async (links) =>{
const csv = new ObjectsToCsv(links);
// Save to file:
await csv.toDisk('./telegram_links.csv');
// Return the CSV file as string:
// console.log(await csv.toString());
}
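One common way to make a Puppeteer scraper like this noticeably faster (a hedged sketch, not from the original post) is to stop downloading resources the script never reads. Request interception lets you abort images, stylesheets, and fonts, and waiting for domcontentloaded instead of networkidle2 avoids idling on ad and analytics traffic; the scraping logic itself stays unchanged:
const puppeteer = require('puppeteer');
(async () => {
  const browser = await puppeteer.launch({ headless: true });
  const page = await browser.newPage();
  // Abort heavy asset requests - the scraper only needs the DOM and link hrefs.
  await page.setRequestInterception(true);
  page.on('request', request => {
    const blocked = ['image', 'stylesheet', 'font', 'media'];
    if (blocked.includes(request.resourceType())) {
      request.abort();
    } else {
      request.continue();
    }
  });
  await page.goto('https://coinmarketcap.com/?page=50', { waitUntil: 'domcontentloaded' });
  // ...run the same table-scraping evaluate() calls as above...
  await browser.close();
})();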

How to scrape infinite scroll websites using Puppeteer [duplicate]

This question already has answers here:
Puppeteer - scroll down until you can't anymore
(11 answers)
Closed 2 years ago.
I am trying to scrape a website which has infinite scrolling.
I am controlling the scroll, but it still exits after reaching the end of the webpage.
This is my code:
const puppeteer = require("puppeteer");
module.exports.scraper = async (url, callBack) => {
const browser = await puppeteer.launch({ headless: false });
const page = await browser.newPage();
await page.setUserAgent(
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36",
);
await page.setViewport({ width: 1200, height: 768 });
function wait(ms) {
return new Promise((resolve) => setTimeout(() => resolve(), ms));
}
await page.goto(`${url}/products/?department=men&l2_category=polos-t-shirts`, {
waitUntil: "networkidle0",
});
// Get the height of the rendered page
const bodyHandle = await page.$("body");
const { height } = await bodyHandle.boundingBox();
await bodyHandle.dispose();
// Scroll one viewport at a time, pausing to let content load
const viewportHeight = page.viewport().height;
let viewportIncr = 0;
while (viewportIncr + viewportHeight < height) {
await page.evaluate((_viewportHeight) => {
window.scrollBy(0, _viewportHeight);
}, viewportHeight);
await wait(1600);
viewportIncr = viewportIncr + viewportHeight;
}
let data = await page.evaluate(() => {
window.scrollTo(0, 0);
let products = [];
let productElements = document.querySelectorAll(".product-wrap");
productElements.forEach((productElement) => {
let productJson = {};
try {
productJson.imageUrl = productElement.querySelector(".renderedImg").src;
productJson.brandName = productElement.querySelector(
".brand-name",
).innerText;
} catch (e) {
console.log(e);
}
products.push(productJson);
});
return products;
});
await wait(100);
callBack(data, true);
await browser.close();
};
How do I scrape in such a situation?
Here's one strategy to handle infinite scrolling. It repeats a scroll/compare in a loop until scrolling has no effect: when we tell the page to scroll but we are still at the same scrollTop value as in the last iteration, we consider it done. In extreme cases the browser will eventually run out of heap memory and crash, but this is our starting point for the average site:
const puppeteer = require('puppeteer');
const url = 'https://example.com';
(async () => {
const browser = await puppeteer.launch();
const page = await browser.newPage();
page.on('console', async msg => {
const args = msg.args();
const vals = [];
for (let i = 0; i < args.length; i++) {
vals.push(await args[i].jsonValue());
}
console.log(vals.join('\t'));
});
await page.goto(url);
await page.evaluate(()=> {
const wait = (duration) => {
console.log('waiting', duration);
return new Promise(resolve => setTimeout(resolve, duration));
};
(async () => {
window.atBottom = false;
const scroller = document.documentElement; // usually what you want to scroll, but not always
let lastPosition = -1;
while(!window.atBottom) {
scroller.scrollTop += 1000;
// scrolling down all at once has pitfalls on some sites: scroller.scrollTop = scroller.scrollHeight;
await wait(300);
const currentPosition = scroller.scrollTop;
if (currentPosition > lastPosition) {
console.log('currentPosition', currentPosition);
lastPosition = currentPosition;
}
else {
window.atBottom = true;
}
}
console.log('Done!');
})();
});
await page.waitForFunction('window.atBottom == true', {
timeout: 900000,
polling: 1000 // poll for finish every second
});
await page.screenshot({path: 'example.png', fullPage: true});
await browser.close();
})();
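If the heap-memory caveat mentioned above is a concern, a small variation (my sketch, not part of the original answer) is to cap the number of scroll steps so a truly endless feed cannot run forever:
await page.evaluate(async () => {
  const scroller = document.documentElement;
  const maxSteps = 500; // assumed safety limit - tune per site
  let lastPosition = -1;
  for (let step = 0; step < maxSteps; step++) {
    scroller.scrollTop += 1000;
    await new Promise(resolve => setTimeout(resolve, 300));
    if (scroller.scrollTop <= lastPosition) break; // no progress, we hit the bottom
    lastPosition = scroller.scrollTop;
  }
});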

How to use multiple link in .goto(url) puppeteer?

const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true});
const page = await browser.newPage();
await page.goto('url/c-0');
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + "chapter");
} catch (error) {
}
})();
Hi all, currently I want to loop through:
url/c-0
url/c-1
url/c-2
.....
Please give me solutions. Thanks, all.
Just loop your job. You can create a for loop to iterate over all the chapters you want to crawl (assuming your chapter URLs share the same format).
const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
const endOfChapterNumber = 10; // number of chapters
for (let c = 0; c <= endOfChapterNumber; c++) { // let, not const - the counter is reassigned
const chapterUrl = 'url/c-' + c;
await page.goto(chapterUrl);
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + " chapter: " + c);
}
} catch (error) {
}
})();
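The try/catch above wraps the whole loop, so the first failed chapter silently ends the run. A hedged variation (not from the original answer) catches per chapter instead, so one broken page does not lose the rest:
const chapters = [];
for (let c = 0; c <= endOfChapterNumber; c++) {
  try {
    await page.goto('url/c-' + c, { waitUntil: 'domcontentloaded' });
    await page.waitForSelector('.box-chap');
    const text = await page.$eval('.box-chap', element => element.textContent);
    chapters.push({ chapter: c, text: text });
  } catch (error) {
    console.error('Failed to scrape chapter ' + c + ':', error.message);
  }
}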

I would like to know how to use $$eval from Puppeteer

I cannot use $$eval properly.
(async() => {
const browser = await puppeteer.launch({ executablePath: chrome ,args: [chromeArgs]});
const page = await browser.newPage();
await page.goto('https://www.example.com/', {waitUntil: "domcontentloaded"});
var links = await page.evaluate(() => {
var hreflist = [];
var tags = document.querySelectorAll("p");
Array.prototype.forEach.call(tags, (tag)=>{
hreflist.push(tag.textContent);
});
return hreflist;
});
console.log(util.inspect(links, false, null));
browser.close();
})();
I would like to do the same thing as the source code above, but using $$eval:
(async() => {
const browser = await puppeteer.launch({ executablePath: chrome ,args: [chromeArgs]});
const page = await browser.newPage();
await page.goto('https://www.example.com/', {waitUntil: "domcontentloaded"});
var links = await page.$$eval('p', list => {
list.map(data => {
data.textContent
})
});
console.log(util.inspect(links, false, null));
browser.close();
})();
The execution result of $$eval() is undefined.
I read the official documentation (https://pptr.dev/#?product=Puppeteer&version=v1.10.0&show=api-pageevalselector-pagefunction-args), but I cannot figure out the problem.
You forgot to return the value from the callback. This will work:
var links = await page.$$eval('p', list => list.map(data => data.textContent));
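Equivalently, if you keep the braces, the callback needs an explicit return - the arrow function in the question has a block body, so it implicitly returns undefined:
var links = await page.$$eval('p', list => {
  return list.map(data => data.textContent); // explicit return from the block body
});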

How to return value from async/await function?

I am using Puppeteer to collect data from two different webpages into arrays for later comparison. However, the program does not wait for the returned array before carrying on.
async function go(){
try{
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++){
const td = tds[i];
const tdcontent = await page.evaluate(td => td.innerText, td);
if (tdcontent.length > 5) {
data[i] = {"content": tdcontent};
}
}
return data;
} catch (e) {
console.log(e);
}
};
(async function main(){
const returnedData = await go();
console.log(returnedData.length);
})();
The returned data.length is 0. I am new to Node.js and the async programming model, and I think it is because .length is logged before the data is returned.
How do I return the data in a way where I can manipulate it and complete my comparisons?
I try not to use page.$$ in such cases. Instead, I use document.querySelectorAll, map over the elements, and extract the text.
Here is the modified code:
const getTdData = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto("http://example.com");
return page.evaluate(() => {
// get all td elements
const tdList = [...document.querySelectorAll("td")];
return tdList.map(element => ({ content: element.textContent }));
});
} catch (e) {
console.log(e);
}
};
(async function main() {
const returnedData = await getTdData();
console.log(returnedData.length);
})();
First of all, make sure the selector in your page.$$() call is a quoted string. It should be:
const tds = await page.$$('td');
Next, you are trying to pass a non-existent variable to page.evaluate(). You can fix this by passing tds[i] instead of td:
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
Your final result should look something like this:
const go = async () => {
try {
const browser = await puppeteer.launch();
const page = await browser.newPage();
await page.goto('www.webpage.com');
const tds = await page.$$('td');
const data = [];
for (let i = 0; i < tds.length; i++) {
const tdcontent = await page.evaluate(td => td.innerText, tds[i]);
if (tdcontent.length > 5) {
data[i] = {
content: tdcontent,
};
}
}
return data;
} catch (error) {
console.log(error);
}
};
(async function main() {
const returnedData = await go();
console.log(returnedData.length);
})();
If you are still experiencing issues, you may want to wait until the page has loaded completely using page.goto( ... , { waitUntil: 'networkidle0' }), or wait until the element in question has been added to the DOM using page.waitForSelector():
await page.goto('www.webpage.com' , {
waitUntil: 'networkidle0',
});
// ...
await page.waitForSelector('td');
