Node.js: given array of URLs, determine which are valid

I am a total scrub with the node http module and I'm having some trouble.
The ultimate goal here is to take a huge list of URLs, figure out which are valid, and then scrape those pages for certain data. So step one is figuring out whether a URL is valid, and this simple exercise is baffling me.
Say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate over this array, make a GET request to each URL, and, if a response comes in, add the link to a list of workingURLs (for now just another array); otherwise it goes to a list of brokenURLs.
var workingURLs = [];
var brokenURLs = [];

for (var i = 0; i < allURLs.length; i++) {
    var url = allURLs[i];
    var req = http.get(url, function (res) {
        if (res) {
            workingURLs.push(?????); // How to derive URL from response?
        }
    });
    req.on('error', function (e) {
        brokenURLs.push(e.host);
    });
}
What I don't know is how to properly obtain the URL from the request/response object itself, or really how to structure this kind of async code, because again, I am a Node.js scrub :(
For most websites, using res.headers.location works, but there are times when the headers do not have this property, and that will cause problems for me later on. I've also tried console-logging the response object itself, and that was a messy and fruitless endeavor.
I have tried pushing the url variable to workingURLs, but by the time any response comes back to trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help

You need to close over the url value to retain access to it and protect it from changes on the next loop iteration.
For example:
(function (url) {
    // use url here
})(allURLs[i]);
The simplest solution is to use forEach instead of a for loop.
allURLs.forEach(function (url) {
    // ....
});
A promisified solution lets you know the moment all the work is done:
var http = require('http');

var allURLs = [
    "http://www.yahoo.com/",
    "http://www.stackoverflow.com/",
    "http://www.sdfhksdjfksjdhg.net/"
];
var workingURLs = [];
var brokenURLs = [];

var promises = allURLs.map(url => validateUrl(url)
    .then(res => (res ? workingURLs : brokenURLs).push(url)));

Promise.all(promises).then(() => {
    console.log(workingURLs, brokenURLs);
});

// ----
function validateUrl(url) {
    return new Promise((ok, fail) => {
        http.get(url, res => ok(res.statusCode == 200))
            .on('error', e => ok(false));
    });
}

// Prevent Node.js from exiting early; not needed if a server is listening.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
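(Note that validateUrl resolves with false on errors instead of rejecting; that way Promise.all always waits for every URL to be checked rather than short-circuiting on the first failure.)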

You can use something like this (not tested):
const arr = ["", "/a", "", ""];

Promise.all(arr.map(url => fetch(url)))
    .then(responses => responses.filter(res => res.ok).map(res => res.url))
    .then(workingUrls => {
        console.log(workingUrls);
        console.log(arr.filter(url => workingUrls.indexOf(url) == -1));
    });
EDITED
Working fiddle (note that you can't make requests to another site in the browser because of cross-domain restrictions).
UPDATED with @vp_arth's suggestions:
const arr = ["/", "/a", "/", "/"];

let working = [], notWorking = [],
    find = url => fetch(url)
        .then(res => res.ok ?
            working.push(res.url) && res :
            notWorking.push(res.url) && res);

Promise.all(arr.map(find))
    .then(responses => {
        console.log('working', working, 'notWorking', notWorking);
        /* Do whatever with the responses if needed */
    });
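(The push(...) && res trick works because Array.prototype.push returns the array's new length, which is at least 1 and therefore truthy, so the expression evaluates to res and passes the response along.)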

Related

How to loop many http requests with axios in node.js

I have an array of users where each user has an IP address.
I have an API to which I send an IP as a request, and it returns a country code that belongs to this IP.
In order to get a country code for each user, I need to send a separate request per user.
In my code I use async/await, but it takes about 10 seconds until I get all the responses; if I don't use async/await, I don't get the country codes at all.
My code:
async function getAllusers() {
    let allUsersData = await usersDao.getAllusers();
    for (let i = 0; i < allUsersData.length; i++) {
        let data = { ip: allUsersData[i].ip };
        let body = new URLSearchParams(data);
        await axios
            .post("http://myAPI", body)
            .then((res) => {
                allUsersData[i].countryCode = res.data.countryCode;
            });
    }
    return allUsersData;
}
You can use Promise.all to make all your requests at once instead of making them one by one.
let requests = [];
for (let i = 0; i < allUsersData.length; i++) {
    let data = { ip: allUsersData[i].ip };
    let body = new URLSearchParams(data);
    requests.push(axios.post("http://myAPI", body)); // axios.post returns a Promise
}

try {
    const results = await Promise.all(requests);
    // results now contains each request's result, in the same order
    // Your logic here...
} catch (e) {
    // Handle errors
}
If you're just trying to get all the results faster, you can request them in parallel and know when they are all done with Promise.all():
async function getAllusers() {
    let allUsersData = await usersDao.getAllusers();
    await Promise.all(allUsersData.map((userData, index) => {
        let body = new URLSearchParams({ ip: userData.ip });
        return axios.post("http://myAPI", body).then((res) => {
            allUsersData[index].countryCode = res.data.countryCode;
        });
    }));
    return allUsersData;
}
Note, I would not recommend doing it this way if the allUsersData array is large (more than about 20 elements), because you'll be raining a lot of simultaneous requests down on the target server, which may impede its performance, get you rate-limited, or even get you refused service. In that case, you'd want to send N requests at a time (perhaps 5), using a concurrency-limiting helper such as pMap() or mapConcurrent().
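As a rough sketch of that idea (this is a simple batching approach, not the pMap()/mapConcurrent() implementations mentioned above), you could process the users in groups of 5:

async function getAllusers() {
    let allUsersData = await usersDao.getAllusers();
    const batchSize = 5;
    for (let i = 0; i < allUsersData.length; i += batchSize) {
        // fire off at most batchSize requests at once, then wait for the whole batch
        const batch = allUsersData.slice(i, i + batchSize);
        await Promise.all(batch.map((userData) => {
            let body = new URLSearchParams({ ip: userData.ip });
            return axios.post("http://myAPI", body).then((res) => {
                userData.countryCode = res.data.countryCode;
            });
        }));
    }
    return allUsersData;
}

Batching waits for the slowest request in each group before starting the next one, whereas pMap()-style helpers keep a constant number of requests in flight, but for a handful of groups the difference is usually negligible.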

Variable number of axios calls, each depending on the previous one

I have an array with a variable number of URLs, and I must merge the data fetched with axios.
The problem is that every axios call depends on the data from the previous one.
If I had a fixed number of URLs I could nest the axios calls and live with that.
I'm thinking of using something like this:
var urls = ["xx", "xx", "xx"];
mergeData(urls);

function mergeData(myarray, myid = 0, mydata = "none") {
    var myurl = "";
    if (Array.isArray(mydata)) {
        myurl = myarray[myid];
        // do my stuff with the data and modify the url
    } else {
        myurl = myarray[myid];
    }
    axios.get(myurl)
        .then(response => {
            var data = []; // do my stuff: get the data I need and put it in an array
            if (myid < myarray.length - 1) {
                mergeData(myarray, myid + 1, data);
            } else {
                // show result on the UI
            }
        })
        .catch(error => {
            console.log(error);
        });
}
...but I don't like it. Is there another solution?
(Be kind, I'm still learning ^^)
Just to be clear, I need to:
make an HTTP request to the first URL, parse the JSON, and save some data (some of it needed for the output);
make another HTTP request to the second URL, with one or more parameters taken from the previous data, parse the JSON, and save some data (some of it needed for the output);
... and so on, 5 to 10 times.
If your goal is to make subsequent HTTP calls based on information you get from previous calls, I'd utilize async/await and for...of to accomplish this instead of relying on a recursive solution.
async function mergeData(urls) {
    const data = [];
    for (const url of urls) {
        const result = await axios.get(url).then(res => res.data);
        console.log(`[${result.id}] ${result.title}`);
        // here, do whatever you want to do with
        // `result` to make your next call...
        // for now, I am just going to append each
        // item to `data` and return it at the end
        data.push(result);
    }
    return data;
}

const items = [
    "https://jsonplaceholder.typicode.com/posts/1",
    "https://jsonplaceholder.typicode.com/posts/2",
    "https://jsonplaceholder.typicode.com/posts/3"
];

console.log("fetching...");
mergeData(items)
    .then(function (result) {
        console.log("done!");
        console.log("final result", result);
    })
    .catch(function (error) {
        console.error(error);
    });

<script src="https://cdnjs.cloudflare.com/ajax/libs/axios/0.19.2/axios.min.js"></script>
Using async/await allows you to utilize for...of, which will wait for each call to resolve or reject before moving on to the next one.
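If each URL really does need one or more parameters from the previous response, you can thread the previous result through the same loop. A minimal sketch (the ref query parameter and the use of result.id are hypothetical, purely for illustration):

async function mergeData(urls) {
    const data = [];
    let previous = null;
    for (const url of urls) {
        // derive the next request from the previous response, if any
        // (`ref` and `previous.id` are made-up names; substitute your real ones)
        const target = previous ? url + '?ref=' + previous.id : url;
        const result = await axios.get(target).then(res => res.data);
        data.push(result);
        previous = result;
    }
    return data;
}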
To learn more about async/await and for...of, have a look here:
developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function
developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/for...of
Hope this helps.

Having difficulties with node.js res.send() loop

I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the URLs has a table, which contains information pertaining to the ID of the URL. This is the data I am attempting to retrieve and store inside a JSON file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res){
    var json;
    for (var i = 1163; i < 1166; i++){
        url = 'https://urlgoeshere.com' + i;
        request(url, function(error, response, html){
            if(!error){
                var $ = cheerio.load(html);
                var mN, mL, iD;
                var json = { mN: "", mL: "", iD: "" };
                $('html body div#wrap h2').filter(function(){
                    var data = $(this);
                    mN = data.text();
                    json.mN = mN;
                })
                $('table.vertical-table:nth-child(7)').filter(function(){
                    var data = $(this);
                    mL = data.text();
                    json.mL = mL;
                })
                $('table.vertical-table:nth-child(8)').filter(function(){
                    var data = $(this);
                    iD = data.text();
                    json.iD = iD;
                })
            }
            fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
                console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
            })
        });
    }
    res.send(json);
})

app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output.json files only contain data for the last URL. I presume that's because I'm attempting to save all the data within the same variable?
If I include res.send() inside the loop, so that the data is written after each page, I receive an error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations it starts, so you do things like res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten, so it only ends up holding the last page's data.
Your loop variable i loses its value by the time you try to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;

app.get('/scrape', async function(req, res) {
    let data = [];
    for (let i = 1163; i < 1166; i++) {
        const url = 'https://urlgoeshere.com/' + i;
        try {
            const html = await rp(url);
            const $ = cheerio.load(html);
            const mN = $('html body div#wrap h2').first().text();
            const mL = $('table.vertical-table:nth-child(7)').first().text();
            const iD = $('table.vertical-table:nth-child(8)').first().text();
            // create object for this iteration of the loop
            const obj = { iD, mN, mL };
            // add this object to our overall array of all the data
            data.push(obj);
            // write a file specifically for this invocation of the loop
            await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
            console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
        } catch(e) {
            // stop further processing on an error
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    // send all the data we accumulated (in an array) as the final result
    res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.
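For instance, the scrape loop above could be written with got() instead of request-promise; here's a sketch assuming got v11 (the last major version usable with require()):

const got = require('got');

app.get('/scrape', async function(req, res) {
    let data = [];
    for (let i = 1163; i < 1166; i++) {
        const url = 'https://urlgoeshere.com/' + i;
        try {
            // got() resolves with a response object; .body holds the HTML
            const { body: html } = await got(url);
            const $ = cheerio.load(html);
            // ... same cheerio extraction and file writing as above
        } catch (e) {
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    res.send(data);
});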

Promise issue with nested loops

I'm trying to write a server in Node, taking data from a SQL database. I have another function that does that, but inside this function I have that list of data (thisData). In my server I need to query another (offsite) server to get a status update, update objects in the list based on it, and then respond with the JSON data.
I've been working on this for a long time and have tried to find solutions based on other answers on here, but I've yet to find something that works. I was originally trying to do this with just callbacks, but now I'm trying to do it with Promise.all().
I have my object thisData; it is a list of three "areas", each of which contains a list of objects.
Both of these code snippets are inside a standard listener function from the standard http lib:
var server = http.createServer((request, response) => {...
Immediately inside this function I also have a call to get the data:
getData(function(data){
    thisData = JSON.parse(data);
All the code snippets below are inside this getData callback, which is inside my listener function.
I also have a function that returns a promise to do the async web query:
var statusRequest = function(area, obj){
    var url = 'https://www.example.com/data/' + obj.id + '/mode.json';
    var options = {
        method: 'GET',
        uri: url,
        strictSSL: false,
        headers: {
            'Authorization': config.auth_key,
            'Accept': "application/json",
        }
    }
    return new Promise((resolve, reject) => {
        // request's callback signature is (err, response, body)
        request(options, function (err, response, body) {
            var status = JSON.parse(body).mode;
            if (status == "success") thisData[area][obj.col].status = 0;
            else if (status == "fail") thisData[area][obj.col].status = 1;
            else thisData[area][obj.col].status = -1;
            resolve(thisData[area][obj.col].status);
        });
    });
}
Then I have a loop that traverses the areas and objects, adds each promise to a list, and calls Promise.all(); then I respond with the updated list:
var promises = [];
Object.keys(thisData).forEach(function(area){
    for (var col = 0; col < thisData[area].length; col++){
        promises.push(statusRequest(area, thisData[area][col]));
    }
});

Promise.all(promises).then((val) => {
    console.log(val);
    response.end(JSON.stringify(thisData));
});
I was initially having problems with all the promises resolving when col == thisData[area].length, so I passed the actual object (which contains its own column information) into the statusRequest function; that way it shouldn't matter if the col variable changes.
The problem I'm having is that my console prints the correct status, but my JSON response does not contain the correct status; in fact, it seems as though none of them are updating.
I would really appreciate any help, thanks in advance!

calling async function recursively in node.js

I am new to node.js and am learning the async library. Currently I have an array of URLs. For each URL, I have to make a request to a website, and from that website's HTML page I will get its hyperlinks. Then I have to make requests recursively with the request module of Node.js.
var urls = ["http://www.a.com", "http://www.b.com"];

function getUrls(url, cb){
    request(url, function(error, response, body){
        if (response && response.statusCode == 200)
        {
        }
        cb();
    });
}

function startProcess(urls){
    async.map(urls, getUrls, function(error, data){
    })
}

startProcess(urls);
In the getUrls function, I call the request function once for each URL. When I get the HTML page data from the response, I scrape URLs from that page as well, and I want to call the request function for those URLs recursively too.
Can it be done without the async.map function?
You can't: you will get a stack overflow after enough iterations. What you should do to solve this task is maintain a queue of URLs you want to scrape, then on each successful response add the newly found URLs to this queue. Something like this:
let queue = ['https://some.start.point.net'];
const concurrency = 5;
let activeThreads = 0;

async function scraper(url) {
    activeThreads++;
    // assumes a promise-returning client such as request-promise
    const body = await request.get(url);
    const urls = []; // parse body here to extract the page's links
    for (const url of urls) queue.push(url);
    activeThreads--;
}

setInterval(() => {
    if (activeThreads < concurrency && queue.length) scraper(queue.shift());
}, 10);
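One caveat with this sketch: it never terminates and will happily re-queue pages it has already fetched. A small addition, a Set of visited URLs, guards against revisits:

const visited = new Set(queue);

async function scraper(url) {
    activeThreads++;
    const body = await request.get(url);
    const urls = []; // parse body here to extract the page's links
    for (const u of urls) {
        if (!visited.has(u)) { // skip anything already seen
            visited.add(u);
            queue.push(u);
        }
    }
    activeThreads--;
}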
