Calling an async function recursively in Node.js

I am new to Node.js and learning the async library. Currently I have an array of URLs. For each URL, I have to make a request to a website, and from the HTML page in the response I collect its hyperlinks. So I have to make the call recursively with the request module of Node.js.
var request = require('request');
var async = require('async');

var urls = ["http://www.a.com", "http://www.b.com"];

function getUrls(url, cb) {
  request(url, function (error, response, body) {
    if (response && response.statusCode == 200) {
    }
    cb();
  });
}

function startProcess(urls) {
  async.map(urls, getUrls, function (error, data) {
  });
}

startProcess(urls);
In the getUrls function, I call request once for each URL. When I get the HTML page from the response, I scrape URLs from that page as well, and I want to call request for those URLs too, recursively.
Can this be done without the async.map function?

You can't do this with plain recursion; after enough iterations you will run into a stack overflow. What you should do instead is maintain a queue of URLs that you want to scrape, and on each successful response push the newly discovered URLs onto that queue. Something like this:
const request = require('request-promise-native'); // promise-based, so it can be awaited

let queue = ['https://some.start.point.net'];
const concurrency = 5;
let activeThreads = 0;

async function scraper(url) {
  activeThreads++;
  try {
    const body = await request.get(url);
    const urls = parseLinks(body); // parseLinks is a placeholder for your own HTML parsing
    for (const link of urls) queue.push(link);
  } finally {
    activeThreads--; // always release the slot, even if the request fails
  }
}

setInterval(() => {
  if (activeThreads < concurrency && queue.length) scraper(queue.shift());
}, 10);
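If you'd rather not poll with setInterval, the same queue idea can be driven by a fixed pool of async workers. A minimal sketch, assuming a promise-based client such as request-promise-native and a parseLinks() helper you would write yourself (both are placeholders, not part of the answer above); a visited set is added so the same URL isn't queued twice:

const request = require('request-promise-native'); // any promise-based HTTP client works

const queue = ['https://some.start.point.net'];
const visited = new Set(queue);
const concurrency = 5;

// Hypothetical helper: extract hyperlinks from an HTML body (e.g. with cheerio).
function parseLinks(body) {
  return [];
}

async function worker() {
  // Each worker pulls from the shared queue until it is empty.
  while (queue.length) {
    const url = queue.shift();
    try {
      const body = await request.get(url);
      for (const link of parseLinks(body)) {
        if (!visited.has(link)) { // skip URLs we have already queued
          visited.add(link);
          queue.push(link);
        }
      }
    } catch (err) {
      console.error(`Failed to fetch ${url}:`, err.message);
    }
  }
  // Note: a worker exits as soon as it sees an empty queue, even if another
  // worker is still fetching and might add more URLs; good enough for a sketch.
}

// Start a fixed number of workers and wait for all of them to drain the queue.
Promise.all(Array.from({ length: concurrency }, () => worker()))
  .then(() => console.log('Crawl finished'));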

Related

How do I pass parameters to Apify BasicCrawler handleRequestFunction?

I'm trying to migrate an existing function to use it inside an Apify actor.
Originally, the function loads a given URL, reads its JSON response, and according to some supplied parameters, extracts some data and returns an object with results.
If you ask, it's not scraping anything "final" at this point. Its results are temporary and will be used to create other URLs, which will then be scraped (by another crawler) for the actual, useful results.
The current function that executes the crawler is something like this:
let url = new URL('/content', someBaseURL);
url.searchParams.set('search', someKeyword);

const reqList = new apify.RequestList({
  sources: [{ url: url.toString() }]
});
await reqList.initialize();

const crawler = new apify.BasicCrawler({
  requestList: reqList,
  handleRequestFunction: reqHandler
});

// How do I set the inputs for reqHandler() here ?
await crawler.run();
// How do I get the output from reqHandler() here ?
And the reqHandler code is something like this:
async function reqHandler(options) {
  const response = await apify.utils.requestAsBrowser({
    url: options.request.url
  });
  // How do I read parameters from the caller here ?
  let searchResults = JSON.parse(response.body);
  // ... result object creation logic goes here ...
  // How do I return a result to the caller here ?
}
I am pretty new to this Apify thing and lost in the documentation.
Thanks for your help.
handleRequestFunction doesn't take any external input or produce any output. Simply use it as a closure and capture inputs from the surrounding code, or wrap it in a different function.
Normally we do it like this:
const context = {}; // put your inputs here

const crawler = new apify.BasicCrawler({
  requestList: reqList,
  handleRequestFunction: async () => {
    // use context here
    // output data
    await apify.pushData(results);
  }
});
EDIT: I forgot to mention how to pass input per request. You need to do it via the request.userData object when adding the request to a queue or a list.
// The same userData option is also available when adding to a request list.
await requestQueue.addRequest({
  url: 'https://example.com',
  userData: { myInput: 'any-data' }
});

// Then in handleRequestFunction
handleRequestFunction: async ({ request }) => {
  const { myInput } = request.userData;
  // ...
}
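Putting the two parts together, here is a minimal sketch (one possible arrangement, reusing reqList and someKeyword from the question): inputs are captured by the closure via a context object, and outputs are collected in a plain results array that the surrounding code can read after crawler.run() finishes, instead of (or in addition to) pushing them to the Apify dataset:

const context = { search: someKeyword }; // inputs captured by the closure
const results = [];                      // outputs collected by the closure

const crawler = new apify.BasicCrawler({
  requestList: reqList,
  handleRequestFunction: async ({ request }) => {
    const response = await apify.utils.requestAsBrowser({ url: request.url });
    const searchResults = JSON.parse(response.body);
    // ... build the result object from `context` and `searchResults` here ...
    results.push({ keyword: context.search, url: request.url, searchResults });
  }
});

await crawler.run();
// `results` now holds whatever handleRequestFunction pushed into it.
console.log(results);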

How to loop many http requests with axios in node.js

I have an array of users where each user has an IP address.
I have an API to which I send an IP address in a request, and it returns the country code that belongs to that IP.
In order to get a country code for each user I need to send a separate request for each user.
In my code I use async/await, but it takes about 10 seconds until I get all the responses; if I don't use async/await, I don't get the country codes at all.
My code:
async function getAllusers() {
  let allUsersData = await usersDao.getAllusers();
  for (let i = 0; i < allUsersData.length; i++) {
    let data = { ip: allUsersData[i].ip };
    let body = new URLSearchParams(data);
    await axios
      .post("http://myAPI", body)
      .then((res) => {
        allUsersData[i].countryCode = res.data.countryCode;
      });
  }
  return allUsersData;
}
You can use Promise.all to fire off all your requests at once instead of making them one by one.
let requests = [];
for (let i = 0; i < allUsersData.length; i++) {
  let data = { ip: allUsersData[i].ip };
  let body = new URLSearchParams(data);
  requests.push(axios.post("http://myAPI", body)); // axios.post returns a Promise
}

try {
  const results = await Promise.all(requests);
  // results now contains each request result in the same order
  // Your logic here...
} catch (e) {
  // Handles errors
}
If you're just trying to get all the results faster, you can request them in parallel and know when they are all done with Promise.all():
async function getAllusers() {
  let allUsersData = await usersDao.getAllusers();
  await Promise.all(allUsersData.map((userData, index) => {
    let body = new URLSearchParams({ ip: userData.ip });
    return axios.post("http://myAPI", body).then((res) => {
      allUsersData[index].countryCode = res.data.countryCode;
    });
  }));
  return allUsersData;
}
Note, I would not recommend doing it this way if the allUsersData array is large (say, more than 20 entries), because you'll be raining a lot of requests on the target server and it may either impede its performance or you may get rate limited or even refused service. In that case, you'd want to send N requests at a time (perhaps 5), using a helper like pMap() or mapConcurrent().
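For reference, a minimal sketch of what such a concurrency-limited map might look like (an illustration of the idea only, not the pMap()/mapConcurrent() implementations mentioned above): it keeps at most limit requests in flight and preserves the order of results.

// Run fn over items with at most `limit` concurrent calls; results keep input order.
async function mapConcurrent(items, limit, fn) {
  const results = new Array(items.length);
  let nextIndex = 0;

  async function runner() {
    while (nextIndex < items.length) {
      const index = nextIndex++; // claimed synchronously, so no two runners share an index
      results[index] = await fn(items[index], index);
    }
  }

  const runners = Array.from({ length: Math.min(limit, items.length) }, () => runner());
  await Promise.all(runners);
  return results;
}

// Usage with the example above, five requests at a time:
// await mapConcurrent(allUsersData, 5, async (userData) => {
//   const body = new URLSearchParams({ ip: userData.ip });
//   const res = await axios.post("http://myAPI", body);
//   userData.countryCode = res.data.countryCode;
// });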

Using global variables in node js in different async functions

I'm trying to get more familiar with best practices in Node.js. Currently, I have an asynchronous function that scrapes some data off a website and stores the retrieved values in an object. What I would like to do is use a value from that object in a different function that extracts specific values from Yahoo Finance. I am unsure how to pass this value to the other function. I'm considering making the value a global variable so the other functions can read it. Would that be best practice in the Node.js world of programming? Any opinions or advice would be helpful. Below is the code I currently have:
const cheerio = require('cheerio');
const axios = require("axios");

async function read_fortune_500() {
  try {
    const { data } = await axios({
      method: "GET",
      url: "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies",
    });
    const $ = cheerio.load(data);
    const elemSelector = '#constituents > tbody > tr > td:nth-child(1)';
    const keys = ['symbol'];

    $(elemSelector).each((parentIndex, parentElem) => {
      let keyIndex = 0;
      const stockObject = {};
      if (parentIndex <= 9) {
        $(parentElem).children().each((childIndex, childElem) => {
          const tdValue = $(childElem).text();
          if (tdValue) {
            stockObject[keys[keyIndex]] = tdValue;
          }
        });
        console.log(stockObject);
      }
    });
  } catch (err) {
    console.error(err);
  }
}

async function getCurrentPrice() {}

read_fortune_500();
async function getCurrentPrice() {}
read_fortune_500()
This sounds more like a JavaScript question than a Node.js-specific question.
Node.js: you could store the result of scraping the website in session data, pass it along in the response and call next(), or create some middleware that scrapes the website before calling the Yahoo route.
JavaScript: you can call an async function to scrape the data on your site and await the response. Once it is done, you can call your next function, passing in the data retrieved from the async result. See below for a basic example.
async function scrapeWebsite() {
  let webScrapeResults;
  // logic to scrape the site
  return webScrapeResults;
}

async function getYahooMarket() {
  let results;
  let webData = await scrapeWebsite();
  // use webData to get results for the Yahoo market
  return results;
}
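An alternative that avoids both a global and re-scraping inside every consumer is to pass the scraped value along as a parameter. A minimal sketch along those lines (main and the webData parameter are just illustrative names):

async function getYahooMarket(webData) {
  let results;
  // use webData (e.g. the stockObject values) to query Yahoo Finance
  return results;
}

async function main() {
  const webData = await scrapeWebsite();             // scrape once
  const marketData = await getYahooMarket(webData);  // reuse the scraped data
  console.log(marketData);
}

main().catch(console.error);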

Node js sequential call for 50 times with each request dependent on data of previous request using for loop

I want to write code in Node.js that makes 50 sequential API calls, where each new API request requires data from the previous request.
I have tried a for loop but failed to pass the previous data into the next API call.
Can you suggest what I should do to write this efficiently?
Thank you in advance!
You can use the async / await syntax here; it is the most appropriate way of looping over asynchronous calls.
Once we receive the result of each call we can save it into an array of all the calls.
It is then very straightforward to pass the last call's response to the next call.
I've decided to use axios here, since request is now deprecated.
Here's an example, I'm using the handy https://jsonplaceholder.typicode.com site to test this:
const axios = require("axios");

async function callApiRepeatedly(callCount = 1) {
  let callResults = [];
  for (let callIndex = 0; callIndex < callCount; callIndex++) {
    let previousCallResult = (callIndex === 0) ? null : callResults[callResults.length - 1];
    console.log(`Calling api #${callIndex + 1} of ${callCount}...`);
    let callResult = await callApi(callIndex, previousCallResult);
    callResults.push(callResult);
  }
  console.log("callResults: Returning results array:", callResults);
  return callResults;
}

async function callApi(index, previousCallData) {
  // We can use the previousCallData here if we wish..
  console.log(`CallApi: Call #${index + 1}, previous call data:`, previousCallData);
  const url = `https://jsonplaceholder.typicode.com/posts/${index + 1}`;
  let response = await axios({ url });
  return response.data;
}
// Set this as appropriate, I'm setting to 5 just for test purposes.
// You would be setting to 50 I guess!
const callCount = 5;
callApiRepeatedly(callCount);
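To make the data dependency concrete, here is one hypothetical variation of callApi that builds each request from the previous response; the id field is just an illustration against the placeholder API, and your real API will dictate which field carries over:

async function callApi(index, previousCallData) {
  // Use the id from the previous response to build the next URL (1 on the first call).
  const id = previousCallData ? previousCallData.id + 1 : 1;
  const url = `https://jsonplaceholder.typicode.com/posts/${id}`;
  const response = await axios({ url });
  return response.data; // a post object whose `id` feeds the next call
}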

Node.js: given array of URLs, determine which are valid

I am a total scrub with the node http module and having some trouble.
The ultimate goal here is to take a huge list of urls, figure out which are valid and then scrape those pages for certain data. So step one is figuring out if a URL is valid and this simple exercise is baffling me.
say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate this array, make a get request to each and if a response comes in, add the link to a list of workingURLs (for now just another array), else it goes to a list brokenURLs.
var workingURLs = [];
var brokenURLs = [];

for (var i = 0; i < allURLs.length; i++) {
  var url = allURLs[i];
  var req = http.get(url, function (res) {
    if (res) {
      workingURLs.push(?????); // How to derive URL from response?
    }
  });
  req.on('error', function (e) {
    brokenURLs.push(e.host);
  });
}
What I don't know is how to properly obtain the URL from the request/response object itself, or really how to structure this kind of async code, because again, I am a nodejs scrub :(
For most websites res.headers.location works, but there are times when the headers do not have this property, and that will cause problems for me later on. I've also tried console logging the response object itself, and that was a messy and fruitless endeavor.
I have tried pushing the url variable to workingURLs, but by the time any response comes back to trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help
You need to close over the url value so you keep access to it and protect it from changes on the next loop iteration.
For example:
(function (url) {
  // use url here
})(allURLs[i]);
The simplest solution is to use forEach instead of a for loop.
allURLs.forEach(function (url) {
  //....
});
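Applied to the question's loop, a minimal sketch (assuming the entries of allURLs include a protocol such as http://, which http.get requires): forEach gives every iteration its own url binding, so the callbacks push the correct value.

var workingURLs = [];
var brokenURLs = [];

allURLs.forEach(function (url) {
  http.get(url, function (res) {
    // `url` is a separate binding for each iteration, so it is safe to push here.
    workingURLs.push(url);
  }).on('error', function (e) {
    brokenURLs.push(url);
  });
});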
A promisified solution also lets you know the moment when all the work is done:
var http = require('http');

var allURLs = [
  "http://www.yahoo.com/",
  "http://www.stackoverflow.com/",
  "http://www.sdfhksdjfksjdhg.net/"
];

var workingURLs = [];
var brokenURLs = [];

var promises = allURLs.map(url => validateUrl(url)
  .then(res => (res ? workingURLs : brokenURLs).push(url)));

Promise.all(promises).then(() => {
  console.log(workingURLs, brokenURLs);
});

// ----
function validateUrl(url) {
  return new Promise((ok, fail) => {
    http.get(url, res => ok(res.statusCode == 200))
      .on('error', e => ok(false));
  });
}

// Prevent nodejs from exiting; not needed if any server is listening.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
You can use something like this (Not tested):
const arr = ["", "/a", "", ""];

Promise.all(arr.map(url => fetch(url)))
  .then(responses => responses.filter(res => res.ok).map(res => res.url))
  .then(workingUrls => {
    console.log(workingUrls);
    console.log(arr.filter(url => workingUrls.indexOf(url) == -1));
  });
EDITED
Working fiddle (note that you can't make a request to another site from the browser because of cross-domain restrictions).
UPDATED with #vp_arth suggestions
const arr = ["/", "/a", "/", "/"];

let working = [], notWorking = [],
  find = url => fetch(url)
    .then(res => res.ok ?
      working.push(res.url) && res : notWorking.push(res.url) && res);

Promise.all(arr.map(find))
  .then(responses => {
    console.log('working', working, 'notWorking', notWorking);
    /* Do whatever with the responses if needed */
  });
