Puppeteer crashes with 'Error: net::ERR_ABORTED at' - node.js

I have a Node.js script that uses Puppeteer to download a few hundred thousand documents from a website. Each file takes about 1.5 s just to get its download link, so with this many downloads the total time is massive, and I thought it would be a good idea to download several files at a time.
My idea was to open the list of pages containing the files in main.js, send batches of x files to x child processes, and have each child open the files in its own browser and download them. (Due to the nature of this website there is no direct download path; the link is generated during the process, so plugging a download URL in directly is not an option.)
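In minimal form, the dispatch pattern looks roughly like this (a simplified sketch with a placeholder batch size, not the real code below; it assumes the child replies 'done' via process.send once a file finishes, as download.js does):
const { fork } = require('child_process');
const BATCH_SIZE = 3; // x child processes
const workers = Array.from({ length: BATCH_SIZE }, () => fork('download.js'));
// send one URL to each worker and resolve once every worker has reported 'done'
function dispatchBatch(urls) {
  return Promise.all(urls.map((url, i) => new Promise(resolve => {
    const onMsg = msg => {
      if (msg === 'done') {
        workers[i].removeListener('message', onMsg);
        resolve();
      }
    };
    workers[i].on('message', onMsg);
    workers[i].send(url);
  })));
}
// process the full list BATCH_SIZE URLs at a time
async function run(allUrls) {
  for (let i = 0; i < allUrls.length; i += BATCH_SIZE) {
    await dispatchBatch(allUrls.slice(i, i + BATCH_SIZE));
  }
  workers.forEach(w => w.kill());
}
// usage: run(listOfDocumentUrls);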
main.js
const fs = require('fs');
const puppeteer = require('puppeteer');
const path = require('path');
const fork = require('child_process').fork;
var zip = require ('./zip_files');
var fld = require('./create_folders');
var log = require('./login');
var ls = [];
const docType = ['pdf', 'word', 'spreadsheet'];
const docExt = ['.pdf', '.docx', '.xlsx'];
const login_url = 'https://www.scribd.com/login';
const login_email = '';
const login_password = '';
const t_out = 10000;
/*
Link Format
https://www.scribd.com/search?query=Exporters+in+Andra+Pradesh&content_type=documents&page=1&filetype=pdf
*/
var pages=[];
const datasetF1 = 'wordset_11.csv';
const datasetF2 = 'wordset_22.csv';
var data_1 = [];
var data_2 = [];
var fileCount = 1000000;
data_1 = fs.readFileSync(datasetF1).toString().split(',');
data_2 = fs.readFileSync(datasetF2).toString().split(',');
(async() => {
const browser = await puppeteer.launch(({
headless: false,
userDataDir: path.join(__dirname,'user_data/main'),
}));
console.log(browser)
const page = (await browser.pages())[0];
//await log.login(page, login_url, login_email, login_password, t_out);
child();
//await page.goto('https://www.scribd.com/', { waitUntil: 'networkidle2' });
for(let i = 0; i < data_1.length; i++){
for(let j = 0; j < data_2.length; j++){
let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '') + ' ' + data_2[j].replace(/\n/gm, '').replace(/\r/gm, '');
let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '') + ' ' + data_2[j].replace(/\n/gm, '').replace(/\r/gm, '');
for(let pageNum = 1; pageNum < (Math.ceil(fileCount/42) +1) && pageNum < 236; pageNum++){
//maxPageNum = 235
let docType = 'pdf';
let query = 'https://www.scribd.com/search?query='+searchTerm+'&content_type=documents&page='+pageNum+'&filetype='+docType;
console.log(pageNum);
await page.goto(query, { waitUntil: 'networkidle2' });
fs.appendFileSync('progress/query.txt', query + '\n');
if (pageNum == 1){
await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out }).then(async() => {
let fileCountR = await page.$x('//div[@class="_7a1igU"]');
let fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
var fileCount = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''));
console.log(fileCount);
}).catch( e => {
console.log('ERROR1');
console.log(e);
//console.log(e);
});
}
await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
let searchResults = await page.$x('//section[@data-testid="search-results"]');
await searchResults[0].waitForXPath('//div/ul/li');
let docPages = await searchResults[0].$x('//div/ul/li');
await download(browser, /*page,*/ docPages, folder);
}).catch( e => {
console.log('ERROR2');
console.log(e);
});
}
//await zip.zipFolder(down_path + folder, zip_path + folder + '.zip');
}
}
//browser.close();
})();
async function child(){
for(let a = 0; a < 3; a++){
ls[a] = fork('download.js');
}
}
async function download(browser, docPages, folder){
let child_count = 3;
const paged = await new Promise(async (resolve, reject) => {
for(let b = 0; b < docPages.length; b+=child_count){
await send_data(browser, docPages, folder, child_count, b);
console.log("b : b");
}
});
}
async function send_data(browser, docPages, folder, child_count, b){
let c = 0;
await new Promise(async (resolve, reject) => {
for(let a = 0; a < child_count; a++){
let urlR = await docPages[a+b].$x('//article/a');
let url = await (await urlR[0].getProperty('href')).jsonValue();
fs.appendFileSync('progress/page.txt', url + '\n');
ls[a].on('message', (msg) => {
if(msg == 'Executed'){
}if(msg == 'done'){
c++
if(c == (child_count-1)){
resolve({});
}
}
});
ls[a].send([url, folder, a]);
}
});
}
child (download.js)
const puppeteer = require('puppeteer');
//const page = (await browser.pages())[0];
const down_path = 'downloads/';
const error_path = 'errors/';
const fs = require('fs');
const path = require('path');
var log = require('./login');
const login_url = 'https://www.scribd.com/login';
const login_email = '';
const login_password = '';
const t_out = 1000;
const dowload_errorFile = 'errors.txt';
var browser;
var flag = 0;
const t_outforL = 100000;
process.on('message', async (obj)=> {
console.log("CHILD: data received from parent process", obj);
console.log("Process Starting : ", process.pid);
url = obj[0];
loc = obj[1];
ins = obj[2];
if(!flag){
browser = await puppeteer.launch(({
headless: false,
userDataDir: path.join(__dirname,'user_data/'+ins),
}));
const downPage = (await browser.pages())[0];
await downPage.goto('https://www.scribd.com', { waitUntil: 'networkidle2' });
flag = 1;
}
await download(url, loc);
});
process.send('Executed');
async function download(url, folder){
const downPage = (await browser.pages())[0];
await downPage.goto(`${url}`, { waitUntil: 'networkidle2' });
await downPage.waitForXPath('//button[@data-e2e="download-button"]', { timeout: 1000 }).then(async ()=>{
let dwnld = await downPage.$x('//button[@data-e2e="download-button"]');
const client = await downPage.target().createCDPSession()
await client.send('Page.setDownloadBehavior', {
behavior: 'allow',
downloadPath: path.join(__dirname, 'downloads/' + folder),
})
await dwnld[0].click();
await downPage.waitForXPath('//a[@class="wrapper__filled-button download_selection_btn"]');
let clicks = await downPage.$x('//a[@class="wrapper__filled-button download_selection_btn"]');
await clicks[0].click();
await downPage.waitForTimeout(500);
process.send('done');
}).catch( e => {
console.log('No download exists');
console.log(e);
});
await downPage.waitForTimeout(2500);
//process.exit();
}
It runs perfectly fine for a round or two, but then Puppeteer crashes with:
Error: net::ERR_ABORTED at https://www.scribd.com/document/456861933
at navigate (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:238:23)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async Frame.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:207:21)
at async CDPPage.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Page.js:439:16)
at async download (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:59:3)
at async process.<anonymous> (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:53:2)
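A guard around the navigation, along the lines of the sketch below, keeps a single net::ERR_ABORTED from killing the whole child process (the safeGoto helper, retry count and delay are hypothetical, and this does not explain why the navigation is aborted in the first place):
// hypothetical helper: retry a navigation a few times before giving up
async function safeGoto(page, url, attempts = 3) {
  for (let i = 0; i < attempts; i++) {
    try {
      return await page.goto(url, { waitUntil: 'networkidle2' });
    } catch (e) {
      console.log(`goto failed (${e.message}), attempt ${i + 1} of ${attempts}`);
      await new Promise(r => setTimeout(r, 2000)); // brief pause before retrying
    }
  }
  return null; // caller can skip this document
}
// usage inside download(): const resp = await safeGoto(downPage, url); if (!resp) return;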

Related

puppeteer-cluster seems to act in serial instead of parallel

I made a cluster of Puppeteer workers using puppeteer-cluster:
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
puppeteerOptions: {
userDataDir: path.join(__dirname,'user_data/1'),
headless: false,
args: ['--no-sandbox']
},
maxConcurrency: maxCon,
monitor: true,
skipDuplicateUrls: true,
timeout:40000000,
retryLimit:5,
});
I then pass some URLs to queue() inside a for loop that iterates over an array of URLs.
The task is to capture screenshots of some websites. When I launch the script it works as intended, except that instead of running in parallel it appears to run serially.
While capturing screenshots I can watch the browser go through the tabs one by one: it takes a screenshot, moves to the next tab, and so on.
What can I do to make it work in parallel?
Full Code :
const puppeteer = require('puppeteer');
const { Cluster } = require('puppeteer-cluster');
const fs = require('fs');
const path = require('path');
var pdfkit = require('pdfkit');
//const zip = require('./zip_files');
//const cfolder = require('./create_folders');
const site = 'scribd.com';
const docType = ['pdf', 'word', 'spreadsheet'];
const t_out = 10000;
const wait = ms => new Promise(res => setTimeout(res, ms));
const scrnDir = 'screenshots';
const docDir = 'documents';
const zipDir = 'zips';
var data_1 = ['Exporter'];
var data_2 = [];
(async() => {
const browser = await puppeteer.launch({
headless: false,
userDataDir: path.join(__dirname,'user_data/main'),
});
const page = (await browser.pages())[0];
for(let i = 0; i < data_1.length; i++){
// for(let j = 0; j < data_2.length; j++){
var numFiles = 1000000;
let folder = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
let searchTerm = data_1[i].replace(/\n/gm, '').replace(/\r/gm, '');
for(let pageNum = 1; pageNum < 2/*(Math.ceil(numFiles/42) +1)*/ && pageNum < 236; pageNum++){
//maxPageNum = 235
let docType = 'pdf';
let query = 'https://www.scribd.com/search?query='+searchTerm+'&content_type=documents&page='+pageNum+'&filetype='+docType;
await page.goto(query, {waitUntil : 'networkidle2'});
//await cfolder.createFolder(docDir, searchTerm);
fs.appendFileSync('progress/query.txt', query + '\n');
if(pageNum == 1){
let numFiles = await fileCount(page);
}
let docPages = await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out }).then(async() => {
let searchResults = await page.$x('//section[@data-testid="search-results"]');
await searchResults[0].waitForXPath('//div/ul/li');
let docPages = await searchResults[0].$x('//div/ul/li');
return docPages;
}).catch( e => {
console.log('getLinks Error');
console.log(e);
});
await save(browser, searchTerm, docPages);
}
//await zip.zipFolder(docDir + '/' + folder, zipDir + '/' + searchTerm + '.zip');
// }
}
})();
async function save(browser, searchTerm, docPages){
//let docPage = await browser.newPage();
let maxCon = 3;
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
puppeteerOptions: {
userDataDir: path.join(__dirname,'user_data/1'),
headless: false,
args: ['--no-sandbox']
},
maxConcurrency: maxCon,
monitor: true,
skipDuplicateUrls: true,
timeout:40000000,
retryLimit:5,
});
await cluster.task(async ({ page, data: {url, title} }) => {
let docPage = page;
await docPage.goto(url, {waitUntil: 'networkidle2'});
//await cfolder.createFolder(scrnDir, title);
await docPage.evaluate('document.querySelector(".nav_and_banners_fixed").remove()');
await docPage.evaluate('document.querySelector(".recommender_list_wrapper").remove()');
await docPage.evaluate('document.querySelector(".auto__doc_page_app_page_body_fixed_viewport_bottom_components").remove()');
//await autoScroll(docPage);
//await docPage.evaluate('document.querySelector(".wrapper__doc_page_webpack_doc_page_body_document_useful").remove()');
await docPage.addStyleTag({content: '.wrapper__doc_page_webpack_doc_page_body_document_useful{visibility: hidden}'})
await docPage.waitForXPath('//span[@class="page_of"]');
let numOfPagesR = await docPage.$x('//span[@class="page_of"]');
let numOfPages = parseInt((await (await numOfPagesR[0].getProperty('textContent')).jsonValue()).split('of ').pop());
console.log(numOfPages);
//const pages = await docPage.$x('//*[@class="newpage"]');
let imgs = [];
for(let j = 0; j < numOfPages; j++){
let sel = '//*[@id="page' + (j+1) + '"]';
let pages = await docPage.$x(sel);
await pages[0].screenshot({
path: scrnDir + '/' + title +j+'.jpg'
});
imgs[j] = title + j +'.jpg';
}
//await createPdf(searchTerm, title, imgs);
});
cluster.on('taskerror', (err, data) => {
console.log(` Error crawling ${data}: ${err.message}`);
});
for(let i = 0; i < 6/**docPages.length*/; i++){
await docPages[i].waitForXPath('//article/a');
let urlR = await docPages[i].$x('//article/a');
let url = await (await urlR[0].getProperty('href')).jsonValue();
await docPages[i].waitForXPath('//p[@data-e2e="title"]');
let titleR = await docPages[i].$x('//p[@data-e2e="title"]');
let title = await (await titleR[0].getProperty('textContent')).jsonValue();
cluster.queue({url : url, title : title});
//console.log(title);
}
await cluster.idle(); //docPage.close();
}
async function fileCount(page){
await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out }).then(async() => {
let fileCountR = await page.$x('//div[@class="_7a1igU"]');
let fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
let numFiles = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''));
console.log('Total Files : ' + numFiles);
return numFiles;
}).catch( e => {
console.log('File Count Error');
console.log(e);
});
}
async function getLinks(page){
}
async function createPdf(searchTerm, title, images){
//await cfolder.createFolder(docDir, searchTerm);
let pdf = new pdfkit({
autoFirstPage: false
});
let writeStream = fs.createWriteStream(docDir+ '/' + searchTerm + '/' + title + '.pdf');
pdf.pipe(writeStream);
for(let i = 0; i < images.length; i++){
let img = pdf.openImage('./' + scrnDir + '/' + title + '/' + images[i]);
pdf.addPage({size: [img.width, img.height]});
pdf.image(img, 0, 0);
}
pdf.end();
await new Promise(async (resolve) => {
writeStream.on('close', ()=>{
console.log('PDF Created successfully');
resolve();
});
});
}
const zip = require('./zip_files');
const cfolder = require('./create_folders');
Both are required for the final code, but they are not needed for the problem.
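For comparison, a stripped-down puppeteer-cluster setup that does run pages in parallel looks roughly like this (a minimal sketch; the URLs and screenshot paths are placeholders, and everything is queued before awaiting cluster.idle()):
const { Cluster } = require('puppeteer-cluster');
(async () => {
  const cluster = await Cluster.launch({
    concurrency: Cluster.CONCURRENCY_PAGE, // several pages inside one browser
    maxConcurrency: 3,
    puppeteerOptions: { headless: true },
  });
  await cluster.task(async ({ page, data: { url, index } }) => {
    await page.goto(url, { waitUntil: 'networkidle2' });
    await page.screenshot({ path: `shot-${index}.png` });
  });
  // queue() is synchronous and does not block, so all tasks are registered up front
  const urls = ['https://example.com', 'https://example.org'];
  urls.forEach((url, index) => cluster.queue({ url, index }));
  await cluster.idle();
  await cluster.close();
})();
If tasks still appear to run one at a time, a common culprit is awaiting slow work inside the loop that calls queue(), so each task only gets queued after the previous one has effectively finished.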

wait until all the child processes are finished in NodeJS when using fork

I have a Node.js script that goes through a list of URLs to download PDFs. Each URL contains 5 PDFs, so I used fork to download all 5 PDFs at once. What I need is to move on to the next URL only after all the child processes (downloads) have finished executing.
This is my main.js:
const puppeteer = require('puppeteer');
const fs = require('fs');
const fork = require('child_process').fork;
const ls = fork("download.js");
var list = [];
var links = [];
var names = [];
(async () => {
const browser = await puppeteer.launch(({headless: false}));
const page = await browser.newPage();
/*
const list_arr = fs.readFileSync('link_list.csv').toString().split(",");
for(l = 1; l < list_arr.length; l += 2){
links[(l - 1) / 2] = list_arr[l];
names[(l - 1) / 2] = list_arr[l - 1];
}
for(let i = 0; i < links.length; i++){*/
// url = links[i];
// name = names[i];
url = 'https://www.responsibilityreports.com/Company/abb-ltd';
name = 'abcd';
await page.goto(url, { waitUntil: 'networkidle2' });
await page.waitForXPath('//li[@class="top_content_list"]/div[@class="left"]/span[@class="ticker_name"]');
let ticker_r = await page.$x('//li[@class="top_content_list"]/div[@class="left"]/span[@class="ticker_name"]');
let ticker = await (await ticker_r[0].getProperty('textContent')).jsonValue();
let cat = ticker[0].toLowerCase();
//console.log(ticker);
await page.waitForXPath('//li[@class="top_content_list"]/div[@class="right"]');
let exchange_r = await page.$x('//li[@class="top_content_list"]/div[@class="right"]');
let exchange = (await (await exchange_r[0].getProperty('textContent')).jsonValue()).split('Exchange').pop().split('More').shift().replace(/\n/gm, '').trim();
//console.log(exchange);
await page.waitForXPath('//div[#class="most_recent_content_block"]/span[#class="bold_txt"]');
let mr_res = await page.$x('//div[#class="most_recent_content_block"]/span[#class="bold_txt"]');
let mr_txt = await mr_res[0].getProperty('textContent');
let mr_text = await mr_txt.jsonValue();
console.log(mr_text);
let [mr_year, ...mr_rest] = mr_text.split(' ');
let mr_type = mr_rest.join(' ').trim();
mr_url = 'https://www.responsibilityreports.com/HostedData/ResponsibilityReports/PDF/'+ exchange + '_' + ticker + '_' + mr_year + '.pdf';
let mr_obj = {
"year" : mr_year,
"type" : mr_type.trim(),
"url" : mr_url
}
list.push(mr_obj);
await page.waitForXPath('//div[#class="archived_report_content_block"]/ul/li/div/span[#class="heading"]');
let ar_reps = await page.$x('//div[#class="archived_report_content_block"]/ul/li/div/span[#class="heading"]');
console.log(ar_reps.length);
for(let k = 0; k < ar_reps.length; k++){
let ar_txt = await ar_reps[k].getProperty('textContent');
let ar_text = await ar_txt.jsonValue();
let [ar_year, ...ar_rest] = ar_text.split(' ');
if(parseInt(ar_year) < 2017){
break;
}
let ar_type = ar_rest.join(' ');
ar_url = 'https://www.responsibilityreports.com/HostedData/ResponsibilityReportArchive/' + cat + '/'+ exchange + '_' + ticker + '_' + ar_year + '.pdf';
let ar_obj = {
"year" : ar_year,
"type" : ar_type,
"url" : ar_url
}
list.push(ar_obj);
console.log(ar_year);
}
/*}*/
for(f = 0; f < list.length; f++){
let url_s = list[f].url;
let type_s = list[f].type;
let year_s = list[f].year;
ls.on('exit', (code)=>{
console.log(`child_process exited with code ${code}`);
});
ls.on('message', (msg) => {
ls.send([url_s ,name,type_s,year_s]);
console.log(msg);
});
}
await browser.close();
console.log('done');
})();
and this is child.js:
const down_path = 'downloads/';
const https = require('https');
const fs = require('fs');
process.on('message', async (arr)=> {
console.log("CHILD: url received from parent process", arr);
url = arr[0];
name = arr[1];
type = arr[2];
year = arr[3];
await download(url,name,type,year);
});
process.send('Executed');
async function download(url,name,type,year) {
https.get(url, res => {
const stream = fs.createWriteStream(down_path + name + '_' + type + '_' + year + '.pdf');
res.pipe(stream);
stream.on('finish', async() => {
console.log('done : ' + year);
stream.close();
});
});
}
Is there any way I can modify my code so that it only moves on once the child processes have finished executing?
EDIT: I also found that the child.js processes are not exiting after stream.close();
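For reference, the waiting part can be expressed by wrapping each forked child in a promise that settles on exit and awaiting them all before moving on; a minimal sketch (downloadBatch is a hypothetical helper, and it assumes download.js calls process.exit() when its stream finishes):
const { fork } = require('child_process');
// download one batch (e.g. the 5 PDFs of one URL), one child per file,
// and resolve only when every child has exited
function downloadBatch(items) {
  return Promise.all(items.map(item => new Promise((resolve, reject) => {
    const child = fork('download.js');
    child.send(item); // [url, name, type, year]
    child.on('exit', code => (code === 0 ? resolve() : reject(new Error(`exit ${code}`))));
  })));
}
// in main: after building `list` for one URL, then
// await downloadBatch(list.map(r => [r.url, name, r.type, r.year]));
The EDIT above is consistent with that last assumption: a forked child keeps running while its IPC channel is open, so download.js needs to call process.exit() (or disconnect) in the stream's 'finish' handler for the 'exit' event to fire.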

How to crawl a website with Node.js

I expect the product info to be printed as it becomes visible. However, the current code prints all loaded items even if they are not shown yet.
How do I modify my code? Thank you.
// const request = require("request");
const cheerio = require("cheerio");
const puppeteer = require('puppeteer');
(async () => {
const browser = await puppeteer.launch({
headless: false // headless Chrome has better performance
});
const page = await browser.newPage();
await page.goto('https://www.balenciaga.com/en-us/women/shoes/sneakers');
await getData(page)
await scrollItem(page)
})();
const scrollItem = async (page) => {
pageHeight = await page.evaluate('document.body.scrollHeight')
await page.evaluate('window.scrollTo(0, document.body.scrollHeight)',
await page.waitForFunction(`document.body.scrollHeight > ${pageHeight}`),
await getData(page)
)
}
const getData = async (page) => {
let body = await page.content()
let $ = await cheerio.load(body)
const data = []
const list = $(".l-productgrid__item .c-product__infos");
for (let i = 0; i < list.length; i++) {
const title = list.eq(i).find('.c-product__infos h2').text();
const price = list.eq(i).find('.c-product__infos p').text().trim();
data.push({ title, price });
}
data.forEach((res, i) => {
console.log(`${i+1} title: ${res.title}, price: ${res.price}`)
})
await scrollItem(page)
}
working code:
// define function which accepts body and cheerio as args
function extract(input, cheerio) {
// return object with extracted values
let $ = cheerio.load(input);
return $('.l-productgrid__item .c-product__infos').map(function() {
return {
header: $('h2', this).text().trim(),
price: $('p', this).text().trim()
}
}).toArray()
}
Proof of work (screenshot omitted).
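In case it helps, this is roughly how that extract helper could be wired into the Puppeteer flow from the question so that only newly visible products are printed (printNewProducts and the dedup key are hypothetical, not part of the answer above):
const cheerio = require('cheerio');
// grab the rendered HTML, run extract(), and log only rows not seen before
const seen = new Set();
async function printNewProducts(page) {
  const rows = extract(await page.content(), cheerio);
  for (const row of rows) {
    const key = `${row.header}|${row.price}`;
    if (!seen.has(key)) {
      seen.add(key);
      console.log(`${seen.size} title: ${row.header}, price: ${row.price}`);
    }
  }
}
// usage: call printNewProducts(page) after each scroll step instead of getData(page)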

Speed up scrapers

I have been scraping for some time now, and recently started using Node and Puppeteer for some projects. I built this scraper to collect Telegram links from a crypto coin marketplace site, but it is quite slow and I don't really know where to start figuring out how to speed it up. So my question is: how do I speed up my web scrapers without losing any of the collected information?
Here is what I have now. It scrapes the Telegram links from about 10,000 different coin pages and then saves those links to a CSV.
const puppeteer = require('puppeteer');
const ObjectsToCsv = require('objects-to-csv');
(async () => {
const browser = await puppeteer.launch({
headless: true,
defaultViewport: null,
args: ['--start-maximized']
});
const page = await browser.newPage();
// const baseUrl = "https://coinmarketcap.com/"
let totalTelegramLinks = []
for (let i = 50; i < 101;i++){
await page.goto(`https://coinmarketcap.com/?page=${i}`, {waitUntil : 'networkidle2' }).catch(e => void 0);
console.log(`[+] Scraping Page ${i}`);
await autoScroll(page);
let allLinks = []
const grabedTableLinks = await page.evaluate(() => {
const aTags = Array.from(document.querySelectorAll('table.cmc-table tbody tr td div.sc-16r8icm-0.escjiH a.cmc-link'))
return aTags.map(a=>a.getAttribute('href'))
})
// allLinks.push([...new Set([...grabedTableLinks, ...allLinks])])
allLinks.push(...grabedTableLinks)
await page.screenshot({
path: 'yoursite.png',
fullPage: true
});
// console.log(allLinks);
console.log(allLinks.length);
// const await clickCoinLinks(page, allLinks)
totalTelegramLinks.push(...(await clickCoinLinks(page, allLinks)))
}
saveToFile(totalTelegramLinks)
console.log('\u0007')
await browser.close();
})();
const telegramRegex = new RegExp('(?:http|https):\/\/(?:t\.me|telegram\.me)\/.*')
const baseUrl = "https://coinmarketcap.com"
async function autoScroll(page){
await page.evaluate(async () => {
await new Promise((resolve, reject) => {
var totalHeight = 0;
var distance = 100;
var timer = setInterval(() => {
var scrollHeight = document.body.scrollHeight;
window.scrollBy(0, distance);
totalHeight += distance;
if(totalHeight >= scrollHeight - window.innerHeight){
clearInterval(timer);
resolve();
}
}, 100);
});
});
}
async function clickCoinLinks(page, links){
let navigations = 0
let totalLinks = []
for (const url of links){
await page.goto(`${baseUrl}${url}`,{waitUntil : 'networkidle2' }).catch(e => void 0)
navigations++
const title = await page.title()
// console.log('---------')
// console.log(title)
const simpleLinkBtns = await page.$$('a.link-button')
let telegramLinks = await linkHandler(simpleLinkBtns, page)
if (telegramLinks.length){
totalLinks.push(...telegramLinks)
// telegramLinks.forEach(link => console.log(link))
}else{
// console.log('[-] No Immediate Link');
const hoverLinkBtns = await page.$$('button.link-button')
telegramLinks = await dropdownBtnHandler(hoverLinkBtns, page)
// console.log('Testing for dropdown link');
if (telegramLinks.length) totalLinks.push(...telegramLinks);
// telegramLinks ? telegramLinks.forEach(link => console.log(link)) : console.log('No dropdown Link either')
}
}
// console.log(totalLinks);
return totalLinks
}
const linkHandler = async (eleHandles, page)=>{
let linkUrls = []
for (const aTag of eleHandles){
linkUrls.push(await (await aTag.getProperty('href')).jsonValue())
}
const telegramLink = testLinks(linkUrls, page)
return telegramLink
}
async function dropdownBtnHandler(eleHandles, page){
let linkUrls = []
let telegramLink
for (const btn of eleHandles){
const btnText = await (await btn.getProperty('innerText')).jsonValue()
if(btnText == 'Chat'){
await btn.hover()
const dropdownLinks = await page.$$('li > a.dropdownItem')
for (const aTag of dropdownLinks){
const hrefVal = await (await aTag.getProperty('href')).jsonValue();
linkUrls.push(hrefVal)
}
telegramLink = testLinks(linkUrls, page)
}
}
return telegramLink ? telegramLink : []
}
const testLinks = async (links, page) =>{
const coin = await page.url().split('/').at(-2)
let telegramLinks = []
let coinLinks = []
links.forEach(link => {
if (telegramRegex.test(link)){
coinLinks.push(link)
}
})
// console.log(telegramLinks);
if(coinLinks.length){
const linkObj = {}
linkObj['coin'] = coin
linkObj['telegram_links'] = coinLinks
telegramLinks.push(linkObj)
}
return telegramLinks
}
const saveToFile = async (links) =>{
const csv = new ObjectsToCsv(links);
// Save to file:
await csv.toDisk('./telegram_links.csv');
// Return the CSV file as string:
// console.log(await csv.toString());
}
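One generic way to shave time off a crawl like this is to block heavy resources with request interception, so each networkidle2 navigation settles sooner. A minimal sketch, assuming nothing else changes (note it would also blank out the debug screenshot, since images are blocked):
// block resource types that are not needed to read the link buttons
async function blockHeavyResources(page) {
  await page.setRequestInterception(true);
  page.on('request', req => {
    const type = req.resourceType();
    if (['image', 'font', 'media'].includes(type)) {
      req.abort();
    } else {
      req.continue();
    }
  });
}
// usage: call once right after browser.newPage()
// const page = await browser.newPage();
// await blockHeavyResources(page);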

How to use multiple links with .goto(url) in Puppeteer?

const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true});
const page = await browser.newPage();
await page.goto('url/c-0');
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + "chapter");
} catch (error) {
}
})();
Hi all, currently I want to loop through:
url/c-0
url/c-1
url/c-2
.....
Please give me a solution, thanks all.
Just loop your job. You could create a for loop over all the chapters you want to crawl (if your chapter URLs share the same format).
const puppeteer = require("puppeteer");
(async () => {
try {
const browser = await puppeteer.launch({ headless: true });
const page = await browser.newPage();
const endOfChapterNumber = 10; // number of chapters
for (let c = 0; c <= endOfChapterNumber; c++) {
const chapterUrl = 'url/c-' + c;
await page.goto(chapterUrl);
await page.waitForSelector('.box-chap');
const element = await page.$(".box-chap");
const content = await page.evaluate(element => element.textContent, element);
console.log(content + " chapter: " + c);
}
} catch (error) {
}
})();
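If the chapter URLs do not share a numeric pattern, the same loop also works over an explicit list (a sketch reusing the page object from the snippet above, with placeholder URLs, inside the same async IIFE):
const chapterUrls = [
  'url/c-0',
  'url/c-1',
  'url/c-2',
  // ...
];
for (const chapterUrl of chapterUrls) {
  await page.goto(chapterUrl);
  await page.waitForSelector('.box-chap');
  // read the chapter text directly from the selector
  const content = await page.$eval('.box-chap', el => el.textContent);
  console.log(content + ' chapter: ' + chapterUrl);
}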
