puppeteer-cluster seems to act in serial instead of parallel - node.js

I made a cluster of Puppeteer workers using puppeteer-cluster:
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
puppeteerOptions: {
userDataDir: path.join(__dirname,'user_data/1'),
headless: false,
args: ['--no-sandbox']
},
maxConcurrency: maxCon,
monitor: true,
skipDuplicateUrls: true,
timeout:40000000,
retryLimit:5,
});
I then pass some URLs to the queue through a for loop iterating over an array of URLs.
The task is to capture screenshots of some websites. When I launch the script it works as intended, but instead of running in parallel it seems to run serially.
While capturing screenshots I can see the browser going through the tabs one by one: it takes a screenshot, moves to the next tab, and so on.
What can I do to make it work in parallel?
Full code:
const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const path = require('path');
var pdfkit = require('pdfkit');
//const zip = require('./zip_files');
//const cfolder = require('./create_folders');
const site = 'scribd.com';
const docType = ['pdf', 'word', 'spreadsheet'];
const t_out = 10000;
const wait = ms => new Promise(res => setTimeout(res, ms));
const scrnDir = 'screenshots';
const docDir = 'documents';
const zipDir = 'zips';
var data_1 = ['Exporter'];
var data_2 = [];
(async() => {
const browser = await puppeteer.launch({
headless: false,
userDataDir: path.join(__dirname,'user_data/main'),
});
const page = (await browser.pages())[0];
for(let i = 0; i < data_1.length; i++){
// for(let j = 0; j < data_2.length; j++){
var numFiles = 1000000;
let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
for(let pageNum = 1; pageNum < 2/*(Math.ceil(numFiles/42) +1)*/ && pageNum < 236; pageNum++){
//maxPageNum = 235
let docType = 'pdf';
let query = 'https://www.scribd.com/search?query='+searchTerm+'&content_type=documents&page='+pageNum+'&filetype='+docType;
await page.goto(query, {waitUntil : 'networkidle2'});
//await cfolder.createFolder(docDir, searchTerm);
fs.appendFileSync('progress/query.txt', query + '\n');
if(pageNum == 1){
let numFiles = await fileCount(page);
}
let docPages = await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
let searchResults = await page.$x('//section[@data-testid="search-results"]');
await searchResults[0].waitForXPath('//div/ul/li');
let docPages = await searchResults[0].$x('//div/ul/li');
return docPages;
}).catch( e => {
console.log('getLinks Error');
console.log(e);
});
await save(browser, searchTerm, docPages);
}
//await zip.zipFolder(docDir + '/' + folder, zipDir + '/' + searchTerm + '.zip');
// }
}
})();
async function save(browser, searchTerm, docPages){
//let docPage = await browser.newPage();
let maxCon = 3;
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
puppeteerOptions: {
userDataDir: path.join(__dirname,'user_data/1'),
headless: false,
args: ['--no-sandbox']
},
maxConcurrency: maxCon,
monitor: true,
skipDuplicateUrls: true,
timeout:40000000,
retryLimit:5,
});
await cluster.task(async ({ page, data: {url, title} }) => {
let docPage = page;
await docPage.goto(url, {waitUntil: 'networkidle2'});
//await cfolder.createFolder(scrnDir, title);
await docPage.evaluate('document.querySelector(".nav_and_banners_fixed").remove()');
await docPage.evaluate('document.querySelector(".recommender_list_wrapper").remove()');
await docPage.evaluate('document.querySelector(".auto__doc_page_app_page_body_fixed_viewport_bottom_components").remove()');
//await autoScroll(docPage);
//await docPage.evaluate('document.querySelector(".wrapper__doc_page_webpack_doc_page_body_document_useful").remove()');
await docPage.addStyleTag({content: '.wrapper__doc_page_webpack_doc_page_body_document_useful{visibility: hidden}'})
await docPage.waitForXPath('//span[@class="page_of"]');
let numOfPagesR = await docPage.$x('//span[@class="page_of"]');
let numOfPages = parseInt((await (await numOfPagesR[0].getProperty('textContent')).jsonValue()).split('of ').pop());
console.log(numOfPages);
//const pages = await docPage.$x('//*[@class="newpage"]');
let imgs = [];
for(let j = 0; j < numOfPages; j++){
let sel = '//*[@id="page' + (j+1) + '"]';
let pages = await docPage.$x(sel);
await pages[0].screenshot({
path: scrnDir + '/' + title +j+'.jpg'
});
imgs[j] = title + j +'.jpg';
}
//await createPdf(searchTerm, title, imgs);
});
cluster.on('taskerror', (err, data) => {
console.log(` Error crawling ${data}: ${err.message}`);
});
for(let i = 0; i < 6/**docPages.length*/; i++){
await docPages[i].waitForXPath('//article/a');
let urlR = await docPages[i].$x('//article/a');
let url = await (await urlR[0].getProperty('href')).jsonValue();
await docPages[i].waitForXPath('//p[@data-e2e="title"]');
let titleR = await docPages[i].$x('//p[@data-e2e="title"]');
let title = await (await titleR[0].getProperty('textContent')).jsonValue();
cluster.queue({url : url, title : title});
//console.log(title);
}
await cluster.idle(); //docPage.close();
}
async function fileCount(page){
await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out }).then(async() => {
let fileCountR = await page.$x('//div[@class="_7a1igU"]');
let fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
let numFiles = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''));
console.log('Total Files : ' + numFiles);
return numFiles;
}).catch( e => {
console.log('File Count Error');
console.log(e);
});
}
async function getLinks(page){
}
async function createPdf(searchTerm, title, images){
//await cfolder.createFolder(docDir, searchTerm);
let pdf = new pdfkit({
autoFirstPage: false
});
let writeStream = fs.createWriteStream(docDir+ '/' + searchTerm + '/' + title + '.pdf');
pdf.pipe(writeStream);
for(let i = 0; i < images.length; i++){
let img = pdf.openImage('./' + scrnDir + '/' + title + '/' + images[i]);
pdf.addPage({size: [img.width, img.height]});
pdf.image(img, 0, 0);
}
pdf.end();
await new Promise(async (resolve) => {
writeStream.on('close', ()=>{
console.log('PDF Created succesfully');
resolve();
});
});
}
const zip = require('./zip_files');
const cfolder = require('./create_folders');
Both requires are for the final code; they are not needed for this problem.
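For reference, the usual puppeteer-cluster pattern for parallel work is to register a single task, queue every job synchronously (without awaiting each queue() call), and only await cluster.idle() at the end. Below is a minimal, hedged sketch of that pattern with placeholder URLs; it uses Cluster.CONCURRENCY_CONTEXT, which gives each worker an isolated incognito context (note this will not reuse a login stored in userDataDir, in which case CONCURRENCY_PAGE with a single profile may still be the better fit). Whether this alone removes the serial behaviour in the full script above is an assumption, not a confirmed fix.
const { Cluster } = require('puppeteer-cluster');

(async () => {
  const cluster = await Cluster.launch({
    // One incognito context per worker. With CONCURRENCY_PAGE all tasks share
    // a single headful window, which can make the run look tab-by-tab.
    concurrency: Cluster.CONCURRENCY_CONTEXT,
    maxConcurrency: 3,
    puppeteerOptions: { headless: false, args: ['--no-sandbox'] },
  });

  await cluster.task(async ({ page, data: { url, title } }) => {
    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.screenshot({ path: `screenshots/${title}.jpg`, fullPage: true });
  });

  // Hypothetical jobs; queue them all up front and do NOT await each queue()
  // call, otherwise each task completes before the next one is even queued.
  const jobs = [
    { url: 'https://example.com/a', title: 'a' },
    { url: 'https://example.com/b', title: 'b' },
    { url: 'https://example.com/c', title: 'c' },
  ];
  for (const job of jobs) cluster.queue(job);

  await cluster.idle();
  await cluster.close();
})();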

Related

wait until all the child processes are finished in NodeJS when using fork

I have a Node.js script that goes through a list of URLs to download PDFs. Each URL contains 5 PDFs, so I used fork to download all 5 PDFs at once. What I need is to move on to the next URL only after all child processes (downloads) have finished executing.
This is my main.js:
const puppeteer = require('puppeteer');
const fs = require('fs');
const fork = require('child_process').fork;
const ls = fork("download.js");
var list = [];
var links = [];
var names = [];
(async () => {
const browser = await puppeteer.launch(({headless: false}));
const page = await browser.newPage();
/*
const list_arr = fs.readFileSync('link_list.csv').toString().split(",");
for(l = 1; l < list_arr.length; l+2){
links[(l - 1) / 2] = list_arr[l];
names([l - 1] / 2) = list_arr[l - 1];
}
for(let i = 0; i < links.length; i++){*/
// url = links[i];
// name = names[i];
url = 'https://www.responsibilityreports.com/Company/abb-ltd';
name = 'abcd';
await page.goto(url, { waitUntil: 'networkidle2' });
await page.waitForXPath('//li[@class="top_content_list"]/div[@class="left"]/span[@class="ticker_name"]');
let ticker_r = await page.$x('//li[@class="top_content_list"]/div[@class="left"]/span[@class="ticker_name"]');
let ticker = await (await ticker_r[0].getProperty('textContent')).jsonValue();
let cat = ticker[0].toLowerCase();
//console.log(ticker);
await page.waitForXPath('//li[@class="top_content_list"]/div[@class="right"]');
let exchange_r = await page.$x('//li[@class="top_content_list"]/div[@class="right"]');
let exchange = (await (await exchange_r[0].getProperty('textContent')).jsonValue()).split('Exchange').pop().split('More').shift().replace(/\n/gm, '').trim();
//console.log(exchange);
await page.waitForXPath('//div[@class="most_recent_content_block"]/span[@class="bold_txt"]');
let mr_res = await page.$x('//div[@class="most_recent_content_block"]/span[@class="bold_txt"]');
let mr_txt = await mr_res[0].getProperty('textContent');
let mr_text = await mr_txt.jsonValue();
console.log(mr_text);
let [mr_year, ...mr_rest] = mr_text.split(' ');
let mr_type = mr_rest.join(' ').trim();
mr_url = 'https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/'+ exchange + '_' + ticker + '_' + mr_year + '.pdf';
let mr_obj = {
"year" : mr_year,
"type" : mr_type.trim(),
"url" : mr_url
}
list.push(mr_obj);
await page.waitForXPath('//div[@class="archived_report_content_block"]/ul/li/div/span[@class="heading"]');
let ar_reps = await page.$x('//div[@class="archived_report_content_block"]/ul/li/div/span[@class="heading"]');
console.log(ar_reps.length);
for(let k = 0; k < ar_reps.length; k++){
let ar_txt = await ar_reps[k].getProperty('textContent');
let ar_text = await ar_txt.jsonValue();
let [ar_year, ...ar_rest] = ar_text.split(' ');
if(parseInt(ar_year) < 2017){
break;
}
let ar_type = ar_rest.join(' ');
ar_url = 'https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/' + cat + '/'+ exchange + '_' + ticker + '_' + ar_year + '.pdf';
let ar_obj = {
"year" : ar_year,
"type" : ar_type,
"url" : ar_url
}
list.push(ar_obj);
console.log(ar_year);
}
/*}*/
for(f = 0; f < list.length; f++){
let url_s = list[f].url;
let type_s = list[f].type;
let year_s = list[f].year;
ls.on('exit', (code)=>{
console.log(`child_process exited with code ${code}`);
});
ls.on('message', (msg) => {
ls.send([url_s ,name,type_s,year_s]);
console.log(msg);
});
}
await browser.close();
console.log('done');
})();
and child.js
const down_path = 'downloads/';
const https = require('https');
const fs = require('fs');
process.on('message', async (arr)=> {
console.log("CHILD: url received from parent process", arr);
url = arr[0];
name = arr[1];
type = arr[2];
year = arr[3];
await download(url,name,type,year);
});
process.send('Executed');
async function download(url,name,type,year) {
https.get(url, res => {
const stream = fs.createWriteStream(down_path + name + '_' + type + '_' + year + '.pdf');
res.pipe(stream);
stream.on('finish', async() => {
console.log('done : ' + year);
stream.close();
});
});
}
Is there any way I can modify my code so that it only goes ahead once the child processes finish executing?
EDIT: I also found that child.js is not exiting after stream.close();
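One way to get "move on only after all downloads finish" is to wrap each forked child in a Promise that resolves on its exit event and await the whole batch with Promise.all. The sketch below is a minimal illustration, not the original code: downloadBatch is a hypothetical helper, and it assumes download.js calls process.exit(0) once its write stream closes (which would also address the EDIT about child.js never exiting).
const { fork } = require('child_process');

// Hypothetical helper: fork one download.js per file and resolve only after
// every child process has exited.
function downloadBatch(files) {
  return Promise.all(files.map(file => new Promise((resolve, reject) => {
    const child = fork('download.js');
    child.on('error', reject);
    child.on('exit', code =>
      code === 0 ? resolve() : reject(new Error(`child exited with code ${code}`)));
    child.send(file); // [url, name, type, year], as in the question
  })));
}

// Inside the main loop, after `list` has been built for the current URL:
// await downloadBatch(list.map(f => [f.url, name, f.type, f.year]));
// ...and only then continue to the next URL.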

Puppeteer crashes 'Error: net::ERR_ABORTED at'

I have a Node.js script that uses Puppeteer to download a few hundred thousand documents from a website, and each file takes about 1.5 s to get the download link. Because of the high download count and the massive amount of time it takes, I thought it would be a good idea to download multiple files at a time.
So my idea was to open the list of pages containing the files in main.js, and send batches of x files to x child processes, each of which opens its file in a new browser and downloads it. (Due to the nature of this website, there is no direct download path; the path is generated during the process, so plugging in a download URL directly is not an option.)
main
const fs = require('fs');
const puppeteer = require('puppeteer');
const path = require('path');
const fork = require('child_process').fork;
var zip = require ('./zip_files');
var fld = require('./create_folders');
var log = require('./login');
var ls = [];
const docType = ['pdf', 'word', 'spreadsheet'];
const docExt = ['.pdf', '.docx', '.xlsx'];
const login_url = 'https://www.scribd.com/login';
const login_email = '';
const login_password = '';
const t_out = 10000;
/*
Link Format
https://www.scribd.com/search?query=Exporters+in+Andra+Pradesh&content_type=documents&page=1&filetype=pdf
*/
var pages=[];
const datasetF1 = 'wordset_11.csv';
const datasetF2 = 'wordset_22.csv';
var data_1 = [];
var data_2 = [];
var fileCount = 1000000;
data_1 = fs.readFileSync(datasetF1).toString().split(',');
data_2 = fs.readFileSync(datasetF2).toString().split(',');
(async() => {
const browser = await puppeteer.launch(({
headless: false,
userDataDir: path.join(__dirname,'user_data/main'),
}));
console.log(browser)
const page = (await browser.pages())[0];
//await log.login(page, login_url, login_email, login_password, t_out);
child();
//await page.goto('https://www.scribd.com/', { waitUntil: 'networkidle2' });
for(let i = 0; i < data_1.length; i++){
for(let j = 0; j < data_2.length; j++){
let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '') + ' ' + data_2[j].replace(/\n/gm, '').replace(/\r/gm, '');
let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '') + ' ' + data_2[j].replace(/\n/gm, '').replace(/\r/gm, '');
for(let pageNum = 1; pageNum < (Math.ceil(fileCount/42) +1) && pageNum < 236; pageNum++){
//maxPageNum = 235
let docType = 'pdf';
let query = 'https://www.scribd.com/search?query='+searchTerm+'&content_type=documents&page='+pageNum+'&filetype='+docType;
console.log(pageNum);
await page.goto(query, { waitUntil: 'networkidle2' });
fs.appendFileSync('progress/query.txt', query + '\n');
if (pageNum == 1){
await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out }).then(async() => {
let fileCountR = await page.$x('//div[@class="_7a1igU"]');
let fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
var fileCount = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''));
console.log(fileCount);
}).catch( e => {
console.log('ERROR1');
console.log(e);
//console.log(e);
});
}
await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
let searchResults = await page.$x('//section[@data-testid="search-results"]');
await searchResults[0].waitForXPath('//div/ul/li');
let docPages = await searchResults[0].$x('//div/ul/li');
await download(browser, /*page,*/ docPages, folder);
}).catch( e => {
console.log('ERROR2');
console.log(e);
});
}
//await zip.zipFolder(down_path + folder, zip_path + folder + '.zip');
}
}
//browser.close();
})();
async function child(){
for(let a = 0; a < 3; a++){
ls[a] = fork('download.js');
}
}
async function download(browser, docPages, folder){
let child_count = 3;
const paged = await new Promise(async (resolve, reject) => {
for(let b = 0; b < docPages.length; b+=child_count){
await send_data(browser, docPages, folder, child_count, b);
console.log("b : b");
}
});
}
async function send_data(browser, docPages, folder, child_count, b){
let c = 0;
await new Promise(async (resolve, reject) => {
for(let a = 0; a < child_count; a++){
let urlR = await docPages[a+b].$x('//article/a');
let url = await (await urlR[0].getProperty('href')).jsonValue();
fs.appendFileSync('progress/page.txt', url + '\n');
ls[a].on('message', (msg) => {
if(msg == 'Executed'){
}if(msg == 'done'){
c++
if(c == (child_count-1)){
resolve({});
}
}
});
ls[a].send([url, folder, a]);
}
});
}
child (download.js)
const puppeteer = require('puppeteer');
//const page = (await browser.pages())[0];
const down_path = 'downloads/';
const error_path = 'errors/';
const fs = require('fs');
const path = require('path');
var log = require('./login');
const login_url = 'https://www.scribd.com/login';
const login_email = '';
const login_password = '';
const t_out = 1000;
const dowload_errorFile = 'errors.txt';
var browser;
var flag = 0;
const t_outforL = 100000;
process.on('message', async (obj)=> {
console.log("CHILD: data received from parent process", obj);
console.log("Process Starting : ", process.pid);
url = obj[0];
loc = obj[1];
ins = obj[2];
if(!flag){
browser = await puppeteer.launch(({
headless: false,
userDataDir: path.join(__dirname,'user_data/'+ins),
}));
const downPage = (await browser.pages())[0];
await downPage.goto('https://www.scribd.com', { waitUntil: 'networkidle2' });
flag = 1;
}
await download(url, loc);
});
process.send('Executed');
async function download(url, folder){
const downPage = (await browser.pages())[0];
await downPage.goto(`${url}`, { waitUntil: 'networkidle2' });
await downPage.waitForXPath('//button[@data-e2e="download-button"]', { timeout: 1000 }).then(async ()=>{
let dwnld = await downPage.$x('//button[@data-e2e="download-button"]');
const client = await downPage.target().createCDPSession()
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: path.join(__dirname, 'downloads/' + folder),
})
await dwnld[0].click();
await downPage.waitForXPath('//a[@class="wrapper__filled-button download_selection_btn"]');
let clicks = await downPage.$x('//a[@class="wrapper__filled-button download_selection_btn"]');
await clicks[0].click();
await downPage.waitForTimeout(500);
process.send('done');
}).catch( e => {
console.log('No download exists');
console.log(e);
});
await downPage.waitForTimeout(2500);
//process.exit();
}
It runs perfectly fine for a round or two, but then Puppeteer crashes with:
Error: net::ERR_ABORTED at https://www.scribd.com/document/456861933
at navigate (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:238:23)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async Frame.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:207:21)
at async CDPPage.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Page.js:439:16)
at async download (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:59:3)
at async process.<anonymous> (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:53:2)
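net::ERR_ABORTED from page.goto() generally means the navigation was cancelled, which commonly happens when a tab is reused while a download or a previous navigation is still in flight. A defensive option is to catch the error and retry with a small backoff. The sketch below is only a hedged illustration; safeGoto is a hypothetical wrapper, not part of the original code.
// Hypothetical wrapper: retry page.goto a few times when the navigation aborts.
async function safeGoto(page, url, attempts = 3) {
  for (let i = 1; i <= attempts; i++) {
    try {
      return await page.goto(url, { waitUntil: 'networkidle2' });
    } catch (err) {
      if (!err.message.includes('net::ERR_ABORTED') || i === attempts) throw err;
      console.log(`goto aborted, retrying (${i}/${attempts}): ${url}`);
      await new Promise(res => setTimeout(res, 1000 * i)); // simple backoff
    }
  }
}

// In download() in download.js, instead of calling downPage.goto(url, ...) directly:
// await safeGoto(downPage, url);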

Speed up scrapers

I have been scraping for some time now, and recently started using Node and Puppeteer for some projects. I built this scraper to collect Telegram links from this crypto coin marketplace site, but it's kind of slow and I don't really know where to start to figure out how to speed it up. So my question is: how do I speed up my web scrapers without losing any of the information that is collected?
Here is what I have now. It tries to scrape the Telegram links from about 10,000 different coin pages, then saves those links to a CSV.
const puppeteer = require('puppeteer');
const ObjectsToCsv = require('objects-to-csv');
(async () => {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--start-maximized']
});
const page = await browser.newPage();
// const baseUrl = "https://coinmarketcap.com/"
let totalTelegramLinks = []
for (let i = 50; i < 101;i++){
await page.goto(`https://coinmarketcap.com/?page=${i}`, {waitUntil : 'networkidle2' }).catch(e => void 0);
console.log(`[+] Scraping Page ${i}`);
await autoScroll(page);
let allLinks = []
const grabedTableLinks = await page.evaluate(() => {
const aTags = Array.from(document.querySelectorAll('table.cmc-table tbody tr td div.sc-16r8icm-0.escjiH a.cmc-link'))
return aTags.map(a=>a.getAttribute('href'))
})
// allLinks.push([...new Set([...grabedTableLinks, ...allLinks])])
allLinks.push(...grabedTableLinks)
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// console.log(allLinks);
console.log(allLinks.length);
// const await clickCoinLinks(page, allLinks)
totalTelegramLinks.push(...(await clickCoinLinks(page, allLinks)))
}
saveToFile(totalTelegramLinks)
console.log('\u0007')
await browser.close();
})();
const telegramRegex = new RegExp('(?:http|https):\/\/(?:t\.me|telegram\.me)\/.*')
const baseUrl = "https://coinmarketcap.com"
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
async function clickCoinLinks(page, links){
let navigations = 0
let totalLinks = []
for (const url of links){
await page.goto(`${baseUrl}${url}`,{waitUntil : 'networkidle2' }).catch(e => void 0)
navigations++
const title = await page.title()
// console.log('---------')
// console.log(title)
const simpleLinkBtns = await page.$$('a.link-button')
let telegramLinks = await linkHandler(simpleLinkBtns, page)
if (telegramLinks.length){
totalLinks.push(...telegramLinks)
// telegramLinks.forEach(link => console.log(link))
}else{
// console.log('[-] No Immediate Link');
const hoverLinkBtns = await page.$$('button.link-button')
telegramLinks = await dropdownBtnHandler(hoverLinkBtns, page)
// console.log('Testing for dropdown link');
if (telegramLinks.length) totalLinks.push(...telegramLinks);
// telegramLinks ? telegramLinks.forEach(link => console.log(link)) : console.log('No dropdown Link either')
}
}
// console.log(totalLinks);
return totalLinks
}
const linkHandler = async (eleHandles, page)=>{
let linkUrls = []
for (const aTag of eleHandles){
linkUrls.push(await (await aTag.getProperty('href')).jsonValue())
}
const telegramLink = testLinks(linkUrls, page)
return telegramLink
}
async function dropdownBtnHandler(eleHandles, page){
let linkUrls = []
let telegramLink
for (const btn of eleHandles){
const btnText = await (await btn.getProperty('innerText')).jsonValue()
if(btnText == 'Chat'){
await btn.hover()
const dropdownLinks = await page.$$('li > a.dropdownItem')
for (const aTag of dropdownLinks){
const hrefVal = await (await aTag.getProperty('href')).jsonValue();
linkUrls.push(hrefVal)
}
telegramLink = testLinks(linkUrls, page)
}
}
return telegramLink ? telegramLink : []
}
const testLinks = async (links, page) =>{
const coin = await page.url().split('/').at(-2)
let telegramLinks = []
let coinLinks = []
links.forEach(link => {
if (telegramRegex.test(link)){
coinLinks.push(link)
}
})
// console.log(telegramLinks);
if(coinLinks.length){
const linkObj = {}
linkObj['coin'] = coin
linkObj['telegram_links'] = coinLinks
telegramLinks.push(linkObj)
}
return telegramLinks
}
const saveToFile = async (links) =>{
const csv = new ObjectsToCsv(links);
// Save to file:
await csv.toDisk('./telegram_links.csv');
// Return the CSV file as string:
// console.log(await csv.toString());
}
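Two common ways to speed up a crawl like this without dropping data are to block resources the scraper never reads (images, stylesheets, fonts) via request interception, and to visit several coin pages concurrently with a small pool of tabs instead of navigating one page serially. The sketch below is a minimal, hedged illustration of both ideas; makeLightweightPage and crawlConcurrently are hypothetical helpers, and the per-page extraction (the clickCoinLinks logic above) is assumed to be passed in as the worker callback.
const puppeteer = require('puppeteer');

// Block heavy resources that are never parsed for links.
async function makeLightweightPage(browser) {
  const page = await browser.newPage();
  await page.setRequestInterception(true);
  page.on('request', req => {
    const type = req.resourceType();
    if (type === 'image' || type === 'stylesheet' || type === 'font') req.abort();
    else req.continue();
  });
  return page;
}

// Visit URLs with a fixed number of concurrent tabs (a tiny worker pool).
async function crawlConcurrently(browser, urls, worker, concurrency = 4) {
  const queue = [...urls];
  const results = [];
  const workers = Array.from({ length: concurrency }, async () => {
    const page = await makeLightweightPage(browser);
    while (queue.length) {
      const url = queue.shift();
      try {
        results.push(await worker(page, url)); // e.g. extract the Telegram links
      } catch (err) {
        console.error(`failed on ${url}: ${err.message}`);
      }
    }
    await page.close();
  });
  await Promise.all(workers);
  return results;
}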

Why do I get zombie puppeteer processes on alpine/docker?

Here is the entirety of my puppeteer controller:
import { Readability } from '@mozilla/readability';
const puppeteer = require('puppeteer-extra');
const StealthPlugin = require('puppeteer-extra-plugin-stealth');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
const summarize = require('summarize');
const keyword_extractor = require('keyword-extractor');
const amex = require('../../csv/AMEX.json');
const nasdaq = require('../../csv/NASDAQ.json');
const nyse = require('../../csv/NYSE.json');
const cryptotickers = require('../../csv/cryptos.json');
puppeteer.use(StealthPlugin());
class Reader {
constructor() {
this.browser = null;
}
async getLink(link) {
this.browser = await puppeteer.launch({
devtools: false,
headless: true,
// product: 'firefox',
executablePath: '/usr/bin/chromium-browser',
args: [
'--proxy-server=' + process.env.PROXY_HOST,
'--no-sandbox',
'--disable-dev-shm-usage',
'--disable-gpu',
'--single-process',
'--disable-setuid-sandbox',
'--no-zygote',
'--shm-size=4gb',
'--disable-infobars',
'--ignore-certifcate-errors',
'--ignore-certifcate-errors-spki-list',
// '--user-agent="Mozilla/5.0 (iPhone; CPU iPhone OS 14_3 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1"'
],
});
const { htm, title } = await this.spa(link);
if (!htm) {
await this.browser.close();
return;
}
const text = txt(htm, link);
const data = Object.assign({}, text);
const parts = new URL(link);
if (!data.title) {
data.title = title;
}
data.summary = summary(data.content, data.title);
data.tickers = tickers(data.content, data.textContent);
data.cryptos = cryptos(data.content, data.textContent);
data.meta = getMeta(htm);
if (!data.title && data.meta.title) {
data.title = data.meta.title;
}
data.url = link;
data.htm = htm;
data.host = parts.host;
data.text = data.textContent;
delete data.textContent;
console.log('data fetched: ' + link);
await this.browser.close();
// await this.browser.disconnect();
return data;
}
async spa(url) {
let htm;
let title;
try {
let page = await this.browser.newPage();
await page.setRequestInterception(true);
page.on('request', (req) => {
if (
req.resourceType() === 'stylesheet' ||
req.resourceType() === 'font' ||
req.resourceType() == 'image'
) {
req.abort();
} else {
req.continue();
}
});
await page.authenticate({
username: process.env.PROXY_USER,
password: process.env.PROXY_PASS,
});
await page.setViewport({ width: 800, height: 600 });
// await page.goto(url, { waitUntil: 'networkidle2' });
await page.goto(url, { waitUntil: 'domcontentloaded' });
await this.autoScroll(page);
await page.evaluate(() => window.scrollTo(0, 50));
htm = await page.content();
title = await page.evaluate(() => document.title);
if (htm.indexOf('<title') === -1) {
htm = await page.evaluate(() => document.documentElement.outerHTML);
}
console.log(title, 'title');
} catch (err) {
console.error(err, url);
}
return { htm, title };
}
async autoScroll(page) {
await page.evaluate(async () => {
new Promise((resolve, reject) => {
try {
const maxScroll = Number.MAX_SAFE_INTEGER;
let lastScroll = 0;
const interval = setInterval(() => {
window.scrollBy(0, document.body.offsetHeight);
const { scrollTop } = document.documentElement;
if (scrollTop === maxScroll || scrollTop === lastScroll) {
clearInterval(interval);
resolve();
} else {
lastScroll = scrollTop;
}
}, 1000);
} catch (error) {
reject(error);
}
}).catch((error) => {
console.error(error); // add catch here
});
});
// await page.evaluate(async () => {
// await new Promise((resolve, reject) => {
// let totalHeight = 0;
// let distance = 300;
// let timer = setInterval(() => {
// const scrollHeight = document.body.scrollHeight;
// window.scrollBy(0, distance);
// totalHeight += distance;
// if(totalHeight >= scrollHeight){
// clearInterval(timer);
// resolve();
// }
// }, 100);
// });
// });
}
} // end Class Reader
async function summarization2(text) {
let res;
let data;
console.log(text, process.env.DEEPAI_KEY);
try {
const body = new FormData();
body.append('text', text);
res = await fetch(`https://api.deepai.org/api/summarization`, {
method: 'POST',
body,
headers: {
'api-key': process.env.DEEPAI_KEY,
},
});
data = await res.json();
} catch (err) {
console.error(err);
}
return data;
}
async function sentiment(text) {
return await deepai.callStandardApi('sentiment-analysis', { text });
}
async function summarization(text) {
return await deepai.callStandardApi('summarization', { text }).catch(console.error);
}
function summary(text, title) {
if (!text) return {};
const summary = summarize(`${title} - ${text}`);
summary.topics = keyword_extractor
.extract(`${title} - ${text}`, {
language: 'english',
remove_digits: true,
return_changed_case: true,
remove_duplicates: false,
})
.map(process);
const counts = summary.topics.reduce(
(acc, value) => ({
...acc,
[value]: (acc[value] || 0) + 1,
}),
{},
);
let topics = [];
for (let topic in counts) {
topics.push({ topic, count: counts[topic] });
}
topics = topics.filter((t) => t.topic);
topics = topics.sort((a, b) => {
return b.count - a.count;
});
topics = topics.slice(0, 10);
topics = topics.map((topic) => topic.topic);
summary.topics = topics;
function process(topic) {
topic = topic.toLowerCase().trim();
topic = topic.replace(/[\W_]+/g, '');
topic = topic.replace(/\s+/g, '-');
return topic;
}
console.log('summary: ', summary);
return summary;
}
function tickers(htm, text) {
if (!text) return {};
const tickers = [];
function findTicker(ticker, exchange) {
let name = ticker.Name;
if (name && name.indexOf('Twitter') === -1 && name.indexOf('Facebook') === -1) {
name = name.replace(/,? ?Inc\.?/gi, '').replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}
const regex = new RegExp(`\\b${name}\\b`, 'gi');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.Name, symbol: ticker.Symbol, exchange });
}
}
amex.forEach((ticker) => {
findTicker(ticker, 'amex');
});
nasdaq.forEach((ticker) => {
findTicker(ticker, 'nasdaq');
});
nyse.forEach((ticker) => {
findTicker(ticker, 'nyse');
});
console.log(tickers);
return tickers;
}
function cryptos(htm, text) {
if (!text) return {};
const tickers = [];
function findTicker(ticker) {
const name = ticker.name.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
const regex = new RegExp(`\\b${name}\\b`, 'g');
if (text.match(regex)) {
console.log(name);
console.log(regex.toString());
tickers.push({ name: ticker.name, symbol: ticker.symbol });
}
}
cryptotickers.forEach(findTicker);
console.log(tickers);
return tickers;
}
function getMeta(htm) {
const doc = new JSDOM(htm);
const meta = {};
const thumb =
doc.window.document.querySelector('meta[property="og:image"]') ||
doc.window.document.querySelector('meta[name="twitter:image"]');
const title = doc.window.document.title;
meta.title = title;
meta.thumb = thumb && thumb.getAttribute('content');
return meta;
}
function txt(htm, link) {
const url = new URL(link);
const doc = new JSDOM(htm);
doc.window.document
.querySelectorAll('img')
.forEach(
(el) =>
(el.src =
el.src.indexOf('http') === 0 || el.src.indexOf('//') === 0
? el.src.indexOf('http://')
? el.src.replace('http:', '')
: el.str
: '//' + url.host + el.src),
);
doc.window.document
.querySelectorAll('a[href]')
.forEach(
(el) =>
(el.href =
el.href && el.href.indexOf('/') === 0
? url.protocol + '//' + url.host + el.href
: el.href),
);
const reader = new Readability(doc.window.document);
return reader.parse();
}
export default Reader;
For some reason, after a few days the Docker container has too many Puppeteer processes, because when fetching URLs the browser doesn't always exit properly.
Eventually the container runs out of resources and the entire app freezes and becomes inaccessible.
I had the same issue when using Puppeteer inside Docker. The solution was to use dumb-init within Docker. The Dockerfile should then look something like this (I assume you are developing a Node project, therefore we call npm start at the end):
RUN apt-get install dumb-init # ... plus your other packages
... your remaining docker things
ENTRYPOINT ["/usr/bin/dumb-init", "--"]
CMD [ "npm", "start" ]

How to use multiple links in .goto(url) in Puppeteer?

const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true});
const page = await browser.newPage();
await page.goto('url/c-0');
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + "chapter");
} catch (error) {
}
})();
Hi all, currently I want to loop through:
url/c-0
url/c-1
url/c-2
...
Please give me solutions. Thanks all.
Just loop your job. You could create a for loop over all the chapters you want to crawl (if your chapter URLs have the same format).
const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
const endOfChapterNumber = 10; // number of chapters
for (let c = 0; c <= endOfChapterNumber; c++) {
const chapterUrl = 'url/c-' + c;
await page.goto(chapterUrl);
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + " chapter: " + c);
}
} catch (error) {
}
})();
