I have a Node.js script that uses Puppeteer to download a few hundred thousand documents from a website. Each file takes about 1.5 s just to obtain its download link, so at this volume the total time is enormous; I therefore decided to download several documents at a time.
My idea was to collect the list of result pages in main.js and send batches of x files to x child processes, each of which opens its own browser instance and performs the download. (Because of how this website works, there is no direct download path — the link is generated during the process, so plugging a download URL in directly is not an option.)
main
// main.js — walks Scribd search-result pages and hands each document URL to a
// pool of forked worker processes (download.js) for the actual download.
const fs = require('fs');
const puppeteer = require('puppeteer');
const path = require('path');
const fork = require('child_process').fork;
var zip = require ('./zip_files'); // project helper; only referenced in a commented-out call below
var fld = require('./create_folders'); // project helper — not referenced in this file; TODO confirm it is needed
var log = require('./login'); // project login helper (the login step below is commented out)
var ls = []; // pool of forked download.js child processes, filled by child()
const docType = ['pdf', 'word', 'spreadsheet']; // supported content types (only 'pdf' is used below)
const docExt = ['.pdf', '.docx', '.xlsx']; // matching extensions — not referenced in this file
const login_url = 'https://www.scribd.com/login';
const login_email = ''; // intentionally blank; fill in before enabling the login call
const login_password = '';
const t_out = 10000; // default timeout (ms) for the XPath waits below
/*
Link Format
https://www.scribd.com/search?query=Exporters+in+Andra+Pradesh&content_type=documents&page=1&filetype=pdf
*/
var pages=[]; // not referenced below — presumably leftover state; TODO confirm
const datasetF1 = 'wordset_11.csv'; // first word list (comma-separated)
const datasetF2 = 'wordset_22.csv'; // second word list (comma-separated)
var data_1 = [];
var data_2 = [];
var fileCount = 1000000; // initial upper bound on results per search term
// Load both word lists synchronously at startup; search terms are the cross
// product of the two lists.
data_1 = fs.readFileSync(datasetF1).toString().split(',');
data_2 = fs.readFileSync(datasetF2).toString().split(',');
(async() => {
  // Coordinator: walks every search term's result pages and hands each page's
  // result items to the worker pool via download().
  const browser = await puppeteer.launch({
    headless: false,
    userDataDir: path.join(__dirname, 'user_data/main'),
  });
  console.log(browser);
  const page = (await browser.pages())[0];
  //await log.login(page, login_url, login_email, login_password, t_out);
  child(); // fork the download.js workers up front
  //await page.goto('https://www.scribd.com/', { waitUntil: 'networkidle2' });
  for (let i = 0; i < data_1.length; i++) {
    for (let j = 0; j < data_2.length; j++) {
      // Strip stray CR/LF left over from splitting the CSVs on commas.
      const searchTerm = data_1[i].replace(/[\r\n]/g, '') + ' ' + data_2[j].replace(/[\r\n]/g, '');
      const folder = searchTerm; // downloads for this term go to a folder of the same name
      // 42 results per page; the site serves at most 235 pages per search.
      for (let pageNum = 1; pageNum < (Math.ceil(fileCount / 42) + 1) && pageNum < 236; pageNum++) {
        const fileType = 'pdf'; // FIX: was `let docType`, shadowing the module-level array
        let query = 'https://www.scribd.com/search?query=' + searchTerm + '&content_type=documents&page=' + pageNum + '&filetype=' + fileType;
        console.log(pageNum);
        await page.goto(query, { waitUntil: 'networkidle2' });
        fs.appendFileSync('progress/query.txt', query + '\n');
        if (pageNum == 1) {
          // Read the real result total from page 1 so the page loop can stop early.
          try {
            // FIX: XPath attribute tests use '@', not '#' — the old selector threw every time.
            await page.waitForXPath('//div[@class="_7a1igU"]', { timeout: t_out });
            const fileCountR = await page.$x('//div[@class="_7a1igU"]');
            const fileCountS = await (await fileCountR[0].getProperty('textContent')).jsonValue();
            // FIX: the original declared `var fileCount` here, shadowing the
            // module-level counter, so the loop bound was never actually updated.
            fileCount = parseInt(fileCountS.split('of ').pop().split(' results').shift().replace(/,/g, ''), 10);
            console.log(fileCount);
          } catch (e) {
            console.log('ERROR1');
            console.log(e);
          }
        }
        try {
          await page.waitForXPath('//section[@data-testid="search-results"]', { timeout: t_out });
          const searchResults = await page.$x('//section[@data-testid="search-results"]');
          await searchResults[0].waitForXPath('//div/ul/li');
          const docPages = await searchResults[0].$x('//div/ul/li');
          await download(browser, /*page,*/ docPages, folder);
        } catch (e) {
          console.log('ERROR2');
          console.log(e);
        }
      }
      //await zip.zipFolder(down_path + folder, zip_path + folder + '.zip');
    }
  }
  //browser.close();
})();
/**
 * Fork the fixed pool of three download.js worker processes into the
 * module-level `ls` array (one slot per worker).
 */
async function child(){
  let slot = 0;
  while (slot < 3) {
    ls[slot] = fork('download.js');
    slot += 1;
  }
}
/**
 * Dispatch the scraped result items to the worker pool in fixed-size batches,
 * waiting for each batch to complete before sending the next.
 *
 * FIX: the original wrapped this loop in a `new Promise` whose resolve was
 * never called, so `download()` could never return to its caller and the
 * outer page loop stalled after the first call. The wrapper is unnecessary —
 * awaiting send_data() per batch is enough.
 *
 * @param browser  puppeteer Browser — unused here, kept for the existing call sites
 * @param docPages ElementHandles of the <li> search results on this page
 * @param folder   destination sub-folder name for this search term
 */
async function download(browser, docPages, folder){
  const child_count = 3; // must match the number of workers forked in child()
  for (let b = 0; b < docPages.length; b += child_count) {
    await send_data(browser, docPages, folder, child_count, b);
    console.log(`b : ${b}`); // FIX: was the literal string "b : b"
  }
}
/**
 * Send one batch of document URLs to the forked workers and wait until the
 * batch reports completion.
 *
 * Fixes over the original:
 *  - clamps the batch to the end of docPages (the old `a+b` indexing ran off
 *    the end of the array on the final, partial batch);
 *  - resolves only after ALL workers in the batch answer 'done' (was
 *    child_count - 1, an off-by-one);
 *  - detaches the per-batch 'message' listeners when the batch completes —
 *    the old code left one listener per worker per call attached forever,
 *    so later batches re-triggered stale handlers and corrupted the count;
 *  - adds a fail-safe timeout so a worker that never reports back cannot
 *    hang the whole run indefinitely.
 */
async function send_data(browser, docPages, folder, child_count, b){
  const batch = Math.min(child_count, docPages.length - b);
  // Resolve the document URLs first, outside the Promise executor.
  const urls = [];
  for (let a = 0; a < batch; a++) {
    const urlR = await docPages[a + b].$x('//article/a');
    const url = await (await urlR[0].getProperty('href')).jsonValue();
    fs.appendFileSync('progress/page.txt', url + '\n');
    urls.push(url);
  }
  await new Promise((resolve) => {
    let done = 0;
    const handlers = [];
    const finish = () => {
      clearTimeout(failSafe);
      for (let a = 0; a < batch; a++) {
        ls[a].removeListener('message', handlers[a]);
      }
      resolve({});
    };
    // Give a stuck batch two minutes before moving on.
    const failSafe = setTimeout(finish, 120000);
    for (let a = 0; a < batch; a++) {
      handlers[a] = (msg) => {
        if (msg == 'done') {
          done++;
          if (done == batch) {
            finish();
          }
        }
      };
      ls[a].on('message', handlers[a]);
      ls[a].send([urls[a], folder, a]);
    }
  });
}
child (download.js)
// download.js — worker process: receives [url, folder, slot] messages from
// main.js over IPC, keeps one persistent browser per worker, and downloads
// each document it is handed.
const puppeteer = require('puppeteer');
//const page = (await browser.pages())[0];
const down_path = 'downloads/'; // not referenced below — download() builds its own path
const error_path = 'errors/'; // not referenced below
const fs = require('fs'); // not referenced below; presumably kept for parity with main.js
const path = require('path');
var log = require('./login'); // project login helper (login is currently disabled)
const login_url = 'https://www.scribd.com/login';
const login_email = ''; // intentionally blank; fill in before enabling login
const login_password = '';
const t_out = 1000; // not referenced below — download() hard-codes its own 1000 ms timeout
const dowload_errorFile = 'errors.txt'; // (sic) not referenced below
var browser; // this worker's persistent browser, launched on the first message
var flag = 0; // 0 until the browser has been launched once
const t_outforL = 100000; // not referenced below — presumably a login timeout; TODO confirm
// Handle one [url, folder, slot] job from the parent. The first job also
// launches this worker's persistent browser.
process.on('message', async (obj)=> {
  console.log("CHILD: data received from parent process", obj);
  console.log("Process Starting : ", process.pid);
  // FIX: url/loc/ins used to be assigned with no declaration at all,
  // creating implicit globals; destructure the tuple into locals instead.
  const [url, loc, ins] = obj;
  if(!flag){
    // First message only: launch the browser with a per-worker-slot user-data
    // dir so the parallel instances do not share one Chrome profile.
    browser = await puppeteer.launch({
      headless: false,
      userDataDir: path.join(__dirname, 'user_data/' + ins),
    });
    const downPage = (await browser.pages())[0];
    await downPage.goto('https://www.scribd.com', { waitUntil: 'networkidle2' });
    flag = 1;
  }
  await download(url, loc);
});
// Runs once at worker startup (not per message) to tell the parent we exist.
process.send('Executed');
/**
 * Navigate to one document page and trigger the site's download flow.
 *
 * Fixes over the original:
 *  - XPath attribute tests use '@', not '#' (the old selectors always threw,
 *    so every document landed in the catch branch);
 *  - the parent is ALWAYS notified via process.send('done'), including on
 *    failure — the old code only reported success, so the batch Promise in
 *    main.js hung whenever a document had no download button.
 *
 * @param url    document page URL received from the parent
 * @param folder destination sub-folder under downloads/
 */
async function download(url, folder){
  const downPage = (await browser.pages())[0];
  await downPage.goto(`${url}`, { waitUntil: 'networkidle2' });
  try {
    await downPage.waitForXPath('//button[@data-e2e="download-button"]', { timeout: 1000 });
    const dwnld = await downPage.$x('//button[@data-e2e="download-button"]');
    // Route Chrome's downloads into the per-search-term folder.
    const client = await downPage.target().createCDPSession();
    await client.send('Page.setDownloadBehavior', {
      behavior: 'allow',
      downloadPath: path.join(__dirname, 'downloads/' + folder),
    });
    await dwnld[0].click();
    await downPage.waitForXPath('//a[@class="wrapper__filled-button download_selection_btn"]');
    const clicks = await downPage.$x('//a[@class="wrapper__filled-button download_selection_btn"]');
    await clicks[0].click();
    await downPage.waitForTimeout(500);
  } catch (e) {
    console.log('No download exists');
    console.log(e);
  } finally {
    // Free this slot in the parent's batch counter even on failure.
    process.send('done');
  }
  await downPage.waitForTimeout(2500);
  //process.exit();
}
It runs perfectly fine for a round or 2, but then puppeteer crashes with
Error: net::ERR_ABORTED at https://www.scribd.com/document/456861933
at navigate (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:238:23)
at process.processTicksAndRejections (node:internal/process/task_queues:95:5)
at async Frame.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Frame.js:207:21)
at async CDPPage.goto (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\node_modules\puppeteer-core\lib\cjs\puppeteer\common\Page.js:439:16)
at async download (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:59:3)
at async process.<anonymous> (D:\Projects\Upwork\Project 6 (Rustom Vazifdar)\code\download.js:53:2)
I want to write a function that reads a number from the console and then counts up from that number in steps of 1 until it reaches 100, where it stops. Please help me solve the problem — thanks!
// Shared readline interface over stdin/stdout, used by ask() below.
const readline = require("readline");
const rl = readline.createInterface({
input: process.stdin,
output: process.stdout
});
// Promise wrapper around rl.question(): resolves with the user's reply.
const ask = (msg) => {
  return new Promise((resolve) => {
    rl.question(msg, resolve);
  });
};
/**
 * Count upward from `min` to 100 in steps of 1, logging each value.
 *
 * Fixes over the original:
 *  - `min + 1` never reassigned the counter (the loop would never advance);
 *  - `return min` inside the loop body exited on the very first pass;
 *  - the console input is a string, so it is converted with Number() first.
 *
 * @param {string|number} min - starting value (as typed on the console)
 * @returns {number} the final counter value (100 when min < 100, else Number(min))
 */
const simpleInterest = (min) => {
  let n = Number(min);
  for (; n < 100; n += 1) {
    console.log(n);
  }
  return n;
};
// Entry point: prompt once for the starting number, run the counter, and
// release the readline interface.
async function main() {
  const min = await ask("min number: ");
  console.log(simpleInterest(min));
  rl.close();
}
main();
Why my command still not getting a cooldown when user used it?
Here's the code:
// Fragment of a discord.js v13 "work" command; `message` comes from the
// enclosing command handler, which is not shown here.
const cooldown = new Set();
// NOTE(review): if this Set declaration sits inside the command handler, a
// fresh empty Set is created on every invocation, which would explain why the
// cooldown never triggers — it must live at module scope to persist. TODO confirm.
const { MessageEmbed } = require('discord.js');
const eco = require('discord-mongoose-economy');
eco.connect("mongodb://localhost/database"); //this is not the main I hide it
const work = require('../../events/casino/work.json')
if(cooldown.has(message.author.id)) {
message.reply(`You're tired! Wait for 30 seconds to work again!`)
} else {
// Pick a random job entry (keys presumably 1..total — TODO confirm work.json shape).
const total = Object.keys(work).length
var random = Math.floor(Math.random() * total + 1);
var workInfo = work[random];
const min = 10;
const max = 100;
// Random payout in [min, max], inclusive.
const coin = Math.floor(Math.random() * (max - min + 1) ) + min;
if(message.author.bot) return;
await eco.give(message.author.id, message.guild.id, coin);
const workembed = new MessageEmbed()
.setDescription(workInfo.description + `and recieved ` + coin + ` coins`)
.setColor('RANDOM')
.setFooter({text: `Work #${random}`})
message.channel.send({embeds: [workembed]});
}
// NOTE(review): this runs on EVERY invocation, even when the user was already
// on cooldown, re-adding the id and scheduling another delete each time.
cooldown.add(message.author.id);
setTimeout(() => {
cooldown.delete(message.author.id)
}, 30000);
Is some of the code misplaced, or is something else wrong? There is still no cooldown. I'm using discord.js v13.
I've edited my codes to this:
// Second attempt: the cooldown bookkeeping moved inside the else-branch so it
// is only set when work actually happened.
if(cooldown.has(message.author.id)) {
message.reply(`You're tired! Wait for 30 seconds to work again!`)
} else {
const total = Object.keys(work).length
var random = Math.floor(Math.random() * total + 1);
var workInfo = work[random];
const min = 10;
const max = 100;
const coin = Math.floor(Math.random() * (max - min + 1) ) + min;
if(message.author.bot) return;
await eco.give(message.author.id, message.guild.id, coin);
const workembed = new MessageEmbed()
.setDescription(workInfo.description + `and recieved ` + coin + ` coins`)
.setColor('RANDOM')
.setFooter({text: `Work #${random}`})
message.channel.send({embeds: [workembed]});
// Remember the user and clear the cooldown after 30 s.
// NOTE(review): if `cooldown` is re-created per invocation (declared inside
// the handler), this still has no effect across calls — TODO confirm scope.
cooldown.add(message.author.id);
setTimeout(() => {
cooldown.delete(message.author.id)
}, 30000);
}
Still not working.
Final edit for my command — it's finally working. I added a package called humanize-duration; here is the final code:
// Final, working version: a Map stores, per user id, the timestamp (ms) at
// which the cooldown expires; humanize-duration renders the remaining time.
const cooldowns = new Map();
const humanizeDuration = require('humanize-duration');
// Look up this user's cooldown expiry (undefined when not cooling down).
const cooldown = cooldowns.get(message.author.id);
if(cooldown) {
// Still cooling down — tell the user how long is left, rounded to whole seconds.
const remaining = humanizeDuration(cooldown - Date.now(), {units: ['s'], round: true})
const cd = new MessageEmbed()
.setAuthor({iconURL: `${message.author.avatarURL()}`,name: `${message.member.user.tag}`})
.setDescription(`You need to wait ${remaining} to do work again, take some rest!`)
.setColor('RANDOM')
return message.channel.send({embeds: [cd]})
.catch(console.error);
} else {
// Pick a random job entry (keys presumably 1..total — TODO confirm work.json shape).
const total = Object.keys(work).length
var random = Math.floor(Math.random() * total + 1);
var workInfo = work[random];
const min = 80000;
const max = 120000;
// Random payout in [min, max], inclusive.
const coin = Math.floor(Math.random() * (max - min + 1) ) + min;
if(message.author.bot) return;
await eco.give(message.author.id, message.guild.id, coin);
const workembed = new MessageEmbed()
.setAuthor({iconURL: `${message.author.avatarURL()}`,name: `${message.member.user.tag}`})
.setDescription(workInfo.description + `and recieved ` + coin + ` coins`)
.setColor('RANDOM')
.setFooter({text: `Work #${random}`})
message.channel.send({embeds: [workembed]});
// Record the expiry and also drop the entry after 30 s.
cooldowns.set(message.author.id, Date.now() + 30000);
setTimeout(() => cooldowns.delete(message.author.id), 30000);
}
If someone need this code I hope you guys can get some tips!
I need a node js function that goes through two arrays and waits for user input every time and then continues to loop.
// Problem code from the question (with placeholder arrays): this registers
// one stdin 'data' listener per (i, j) pair up front, so every keystroke
// fires ALL accumulated listeners at once instead of advancing the loop one
// step per input. The answer below replaces this with an awaited prompt.
let array = **SOME_ARRAY**;
let array_2 = **SOME_ARRAY**;
for (let i in array) {
for (let j in array_2) {
process.stdin.on('data', data => {
data = data.toString();
data = data.replaceAll('\r', '').replaceAll('\n', '');
console.log(data, array[i], array_2[j]);
});
}
}
You could use readline.question with async-await:
const readline = require('readline');
/**
 * Prompt the user on stdin and resolve with one line of input. A fresh
 * readline interface is created per call and closed once answered.
 *
 * FIX: the original named its local `interface`, which is a reserved word in
 * strict mode — a SyntaxError inside any ES module (and under 'use strict').
 *
 * @returns {Promise<string>} resolves with the user's answer
 */
function readInput() {
  const rl = readline.createInterface({
    input: process.stdin,
    output: process.stdout,
  });
  return new Promise(resolve => rl.question("Please provide next input: ", answer => {
    rl.close();
    resolve(answer);
  }))
}
(async () => {
  // Walk every (a, b) pair from the two arrays, pausing for one line of user
  // input before handling each pair.
  const firstList = [1, 2];
  const secondList = [4, 5];
  for (const a of firstList) {
    for (const b of secondList) {
      const data = await readInput();
      // Do something with your data...
      console.log(data, a, b);
    }
  }
})();
I am trying to execute the following AWS lambda:
index.js:
// Lambda entry point: delegates to test.main() and awaits its result.
// NOTE(review): require("test") resolves a package named "test" — presumably
// the local ./test.js shown below was intended; verify the import path.
const test = require("test");
exports.handler = async (event) => {
await test.main();
};
test.js:
// Question's test.js: spawns the worker and returns immediately — main()
// never waits for the 'message'/'exit' events. On a laptop the process stays
// alive until the event loop drains, so the output always appears; on Lambda
// the runtime can freeze the sandbox as soon as the awaited handler returns,
// so the worker's output shows up only sporadically. The answer further down
// fixes this by returning a Promise tied to the worker's 'exit' event.
const {Worker} = require("worker_threads");
const main = () => {
let num1 = 10;
let num2 = 20;
const worker = new Worker("./worker.js", {workerData: {num1, num2}});
worker.on("message", result => {
console.log(`${num1} + ${num2} is ${result}`);
});
worker.on("error", error => {
console.log(error);
});
worker.on("exit", exitCode => {
console.log(exitCode);
});
console.log("I am here!");
}
exports.main = main;
worker.js:
// Worker entry point: receives {num1, num2} via workerData and posts their
// sum back to the parent thread.
const { parentPort, workerData } = require("worker_threads");
function getSum(a, b) {
  return a + b;
}
parentPort.postMessage(getSum(workerData.num1, workerData.num2));
When I run the same program on my laptop it is working fine. I see the output of the worker thread consistently.
Output on my laptop:
❯ node index.js
I am here!
10 + 20 is 30
0
Output on the lambda:
START RequestId: c178d74b-da57-4765-9fa7-77d3fc83d645 Version: $LATEST
2021-08-31T14:33:37.353Z c178d74b-da57-4765-9fa7-77d3fc83d645 INFO I am here!
END RequestId: c178d74b-da57-4765-9fa7-77d3fc83d645
REPORT RequestId: c178d74b-da57-4765-9fa7-77d3fc83d645 Duration: 2.12 ms Billed Duration: 3 ms Memory Size: 10240 MB Max Memory Used: 108 MB
When I run the lambda, the output is very random. Sometimes I see the output of the worker thread and other times I don't.
Why is there a difference in execution of the program on AWS lambda and on my laptop?
You don't await for the worker async operation to complete in the test.js file. Try adding a promise that resolves when worker finishes. Like this:
const { Worker } = require("worker_threads");
// Spawn the worker, log its result as it arrives, and resolve only once the
// worker has exited — so a Lambda awaiting main() cannot be frozen before
// the worker finishes.
const main = async () => {
  const num1 = 10;
  const num2 = 20;
  const worker = new Worker("./worker.js", { workerData: { num1, num2 } });

  worker.on("message", (sum) => console.log(`${num1} + ${num2} is ${sum}`));
  worker.on("error", (err) => console.log(err));
  console.log("I am here!");

  // Awaiting for the worker to finish here
  const exited = new Promise((resolve) => {
    worker.on("exit", (code) => {
      console.log(code);
      resolve();
    });
  });
  return exited;
};
exports.main = main;
Alternatively, you can tweak `context.callbackWaitsForEmptyEventLoop` (note: there is no `doNotWaitForEmptyEventLoop` property on the Lambda context), but relying on it is not recommended, as it's error-prone and hard to debug.