Test random URLs from spreadsheet using alasql - node.js

I have a large number of URLs within a xlsx file. What I'd like to do is randomly select some of these URLs, load them, then check that they return a status code of 200.
So I'm using the npm alasql package to do this.
At the moment, the following code successfully loads the first 5 URLs in the spreadsheet, checks that they return a status code of 200, then finishes the test.
var alasql = require('alasql');
var axios = require('axios');
module.exports = {
'#tags': ['smokeTest'],
// Reads every URL from the RandomUrls sheet of the spreadsheet and verifies
// that the first five of them respond with HTTP 200.
'Site map XML pages load': async (browser) => {
// Pull the whole URL column out of the xlsx file via alasql.
const result = await alasql.promise('select URL from xlsx("./testUrls.xlsx",{sheetid:"RandomUrls"})');
var xcelData = result.map(item => {
return item.URL;
});
// GETs one URL (relative to the launch URL) and asserts a 200 response.
async function siteMapUrlsTestArr(item) {
var response = await axios.get(browser.launch_url + item);
browser.verify.equal(response.status, 200);
console.log('Sitemap test URL =', (browser.launch_url + item));
}
// NOTE(review): this only ever exercises the FIRST five rows (index < 5);
// the question below asks how to pick five rows at random instead.
// The empty `else {}` branch is a no-op.
for (let index = 0; index < xcelData.length; index++) {
if (index < 5) {
const xmlTestUrl = xcelData[index];
await siteMapUrlsTestArr(xmlTestUrl);
} else {}
}
},
'Closing the browser': function (browser) {
browser.browserEnd();
},
};
However, what I'd like to do is randomly select 5 URLs from the (large) list of URLs in the spreadsheet, rather than the first 5 URLs.
I appreciate that this will (probably) involve using the Math.floor(Math.random()) functions, but I can't seem to get it to work no matter where I place this call.
Any help would be greatly appreciated. Thanks.

Your logic is flawed. Here's how.
You want to select 5 random URLs from the list and then perform the operation on those items, but what you are actually doing is taking all the items and running the operation, via a loop, on only the first five.
To correct it:
//Fixed to five as you only want to test 5 URLs.
for (let index = 0; index < 5; index++) {
//Selecting a Random item from the list using Math.random();
// NOTE(review): this samples WITH replacement — the same URL can be
// drawn more than once across the five iterations.
const xmlTestUrl = xcelData[Math.floor(Math.random() * xcelData.length)];
//Performing the HTTP response operation on it.
await siteMapUrlsTestArr(xmlTestUrl);
}
The aforementioned solution will select a random item in each loop and perform the HTTP response operation on it. The items will be randomly selected using Math.random().

Related

Wait for a function to create modified Array

I'm writing React app. After clicking one button, I want the file to be downloaded. Before that, the array that I have has to be modified in order to have the downloaded report in proper format.
The problem I have is that I don't know how to force getReports() to wait for setInOrder() to process the data. Therefore code doesn't enter the loop.
// Group the given objects by their `addedBy` property.
// Returns a plain object mapping each distinct addedBy value to the array
// of objects carrying it. Kept `async` for backward compatibility with
// callers that `await` it, although nothing asynchronous happens here.
export const setInOrder = async (objects) => {
const sortedObjectsAll = {};
for (let i = 0; i < objects.length; ++i) {
const key = objects[i].addedBy;
if (!Object.prototype.hasOwnProperty.call(sortedObjectsAll, key)) {
sortedObjectsAll[key] = [];
}
// BUG FIX: the original only pushed when the key already existed,
// silently dropping the first object of every group.
sortedObjectsAll[key].push(objects[i]);
}
return sortedObjectsAll;
}
// Builds a report per group produced by setInOrder().
export const getReports = async (objects) => {
const sortedObjectsAll = await setInOrder(objects) // This is correct but not available instantly
console.log(sortedObjectsAll) // this is correctly printed
const reports = new Array();
// BUG (addressed in the answer below): sortedObjectsAll is a plain object,
// not an array, so sortedObjectsAll.length is undefined and the loop
// condition is never true — the body never runs.
for (let j = 0; j < sortedObjectsAll.length; ++j) {
console.log("Never enters here")
reports.push(createReport(sortedObjectsAll[j]))
}
return reports
}
I'm trying to use await or async somehow, but can't solve it. I see some Promises advised but I don't know how to really return the resulting variable to the code that actually downloads the report.
First, you do not need to make this function async at all, because it performs no asynchronous operation (and an async function that contains no await waits for nothing).
Second you want to iterate through an object, and not through an array, and that is the problem. Replace with the following (there are other solutions as well):
for (const key in sortedObjectsAll) {
...
}

How to get element from multiple URLs appending each one?

I have a website that has a main URL containing several links. I want to get the first <p> element from each link on that main page.
I have the following code that works fine to get the desired links from main page and stores them in urls array. But my issue is
that I don't know how to make a loop to load each url from urls array and print each first <p> in each iteration or append them
in a variable and print all at the end.
How can I do this? thanks
var request = require('request');
var cheerio = require('cheerio');
var main_url = 'http://www.someurl.com';
request(main_url, function(err, resp, body){
// NOTE(review): $, links and lnk are assigned without var/let/const,
// creating implicit globals.
$ = cheerio.load(body);
links = $('a'); //get all hyperlinks from main URL
var urls = [];
//With this part I get the links (URLs) that I want to scrape.
$(links).each(function(i, link){
lnk = 'http://www.someurl.com/files/' + $(link).attr('href');
urls.push(lnk);
});
//In this part I don't know how to make a loop to load each url within urls array and get first <p>
// BUG: `$` still holds the MAIN page's DOM here, so every iteration
// selects the main page's first <p>; the sub-pages are never requested.
for (i = 0; i < urls.length; i++) {
var p = $("p:first") //first <p> element
console.log(p.html());
}
});
if you can successfully get the URLs from the first <p>, you already know everything to do that so I suppose you have issues with the way request is working and in particular with the callback based workflow.
My suggestion is to drop request since it's deprecated. You can use something like got which is Promise based so you can use the newer async/await features coming with it (which usually means easier workflow) (Though, you need to use at least nodejs 8 then!).
Your loop would look like this:
// NOTE: must run inside an async function (shown below) so `await` is legal.
// BUG FIX: the original declared the counter with `const`, which throws a
// TypeError on the first `i++` — a mutated for-loop counter needs `let`.
for (let i = 0; i < urls.length; i++) {
const source = await got(urls[i]);
// Do your cheerio determination
console.log(new_p.html());
}
Mind you, that your function signature needs to be adjusted. In your case you didn't specify a function at all so the module's function signature is used which means you can't use await. So write a function for that:
async function pullAllUrls() {
const mainSource = await got(main_url);
...
}
If you don't want to use async/await you could work with some promise reductions but that's rather cumbersome in my opinion. Then rather go back to promises and use a workflow library like async to help you manage the URL fetching.
A real example with async/await:
In a real life example, I'd create a function to fetch the source of the page I'd like to fetch, like so (don't forget to add got to your script/package.json):
// Download the given page and resolve with its raw HTML body text.
async function getSourceFromUrl(thatUrl) {
const { body } = await got(thatUrl);
return body;
}
Then you have a workflow logic to get all those links in the other page. I implemented it like this:
// Collect the href of every <a> nested in <li> elements under the page's
// main <ul> menu, returning them as a flat array of strings.
async function grabLinksFromUrl(thatUrl) {
const mainSource = await getSourceFromUrl(thatUrl);
const $ = cheerio.load(mainSource);
const hrefs = [];
$('ul.menu__main-list').each((i, content) => {
$('li a', content).each((idx, inner) => {
const wantedUrl = $(inner).attr('href');
hrefs.push(wantedUrl);
});
});
// FIX: the original chained `.get()` onto the `.each(...)` call and
// discarded its result — a dead call with no effect; removed.
return hrefs;
}
I decided that I'd like to get the links in the <nav> element which are usually wrapped inside <ul> and elements of <li>. So we just take those.
Then you need a workflow to work with those links. This is where the for loop is. I decided that I wanted the title of each page.
// Gather every nav link from the start page, then visit each link in turn
// and report the page's title and source size.
async function mainFlow() {
const startPage = 'https://netzpolitik.org/';
const urls = await grabLinksFromUrl(startPage);
for (let i = 0; i < urls.length; i += 1) {
const url = urls[i];
const source = await getSourceFromUrl(url);
const $ = cheerio.load(source);
// Netpolitik has two <title> in their <head>
const title = $('head > title').first().text();
console.log(`${title} (${url}) has source of ${source.length} size`);
// TODO: More work in here
}
}
And finally, you need to call that workflow function:
return mainFlow();
The result you see on your screen should look like this:
Dossiers & Recherchen (https://netzpolitik.org/dossiers-recherchen/) has source of 413853 size
Der Netzpolitik-Podcast (https://netzpolitik.org/podcast/) has source of 333354 size
14 Tage (https://netzpolitik.org/14-tage/) has source of 402312 size
Official Netzpolitik Shop (https://netzpolitik.merchcowboy.com/) has source of 47825 size
Über uns (https://netzpolitik.org/ueber-uns/#transparenz) has source of 308068 size
Über uns (https://netzpolitik.org/ueber-uns) has source of 308068 size
netzpolitik.org-Newsletter (https://netzpolitik.org/newsletter) has source of 291133 size
netzwerk (https://netzpolitik.org/netzwerk/?via=nav) has source of 299694 size
Spenden für netzpolitik.org (https://netzpolitik.org/spenden/?via=nav) has source of 296190 size

Having difficulties with node.js res.send() loop

I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each url to a single json file. The url structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the urls has a table, which contains information pertaining to the ID of the url. This is the data I am attempting to retrieve and store inside a json file.
I am still extremely new to this and having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res){
var json;
// BUG (see answer below): request() is asynchronous, so this loop fires
// all the requests and falls straight through; res.send(json) at the
// bottom runs before any callback has assigned data.
for (var i = 1163; i < 1166; i++){
// NOTE(review): `url` is assigned without var/let/const — implicit global.
url = 'https://urlgoeshere.com' + i;
request(url, function(error, response, html){
if(!error){
var $ = cheerio.load(html);
var mN, mL, iD;
// This inner `json` shadows the outer one, so the outer `json`
// sent below is never populated.
var json = { mN : "", mL : "", iD: ""};
$('html body div#wrap h2').filter(function(){
var data = $(this);
mN = data.text();
json.mN = mN;
})
$('table.vertical-table:nth-child(7)').filter(function(){
var data = $(this);
mL = data.text();
json.mL = mL;
})
$('table.vertical-table:nth-child(8)').filter(function(){
var data = $(this);
iD = data.text();
json.iD = iD;
})
}
// `i` was declared with var, so by the time this callback runs the loop
// has already finished and `i` is 1166 for every file name.
fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err){
console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
})
});
}
res.send(json); // sends the outer `json`, still undefined at this point
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
When I run the code as displayed above, the output within the output.json file only contains data for the last url. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data writes after each page, I receive the error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, thus you do some things like res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten so it only has the last data in it.
Your loop variable i would lose its value by the time you tried to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;
// Scrapes pages 1163–1165 sequentially, writes one JSON file per page and
// responds with the accumulated array of scraped objects (or 500 on the
// first error).
app.get('/scrape', async function(req, res) {
// FIX: declared with const (it is mutated via push, never reassigned).
const data = [];
for (let i = 1163; i < 1166; i++) {
const url = 'https://urlgoeshere.com/' + i;
try {
// await pauses the loop until this page's HTML has arrived.
const html = await rp(url); // FIX: terminated with a semicolon
const $ = cheerio.load(html);
const mN = $('html body div#wrap h2').first().text();
const mL = $('table.vertical-table:nth-child(7)').first().text();
const iD = $('table.vertical-table:nth-child(8)').first().text();
// create object for this iteration of the loop
const obj = {iD, mN, mL};
// add this object to our overall array of all the data
data.push(obj);
// write a file specifically for this invocation of the loop
await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
} catch(e) {
// stop further processing on an error
console.log("Error scraping ", url, e);
res.sendStatus(500);
return;
}
}
// send all the data we accumulated (in an array) as the final result
res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan, 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself and it's built from the beginning to use promises and is simple to use.

PDF.js - split pdf into pages and re-build multiple files

I am currently working on a Node.js project. One of the actions required is to read the text of a pdf document and then split the document into separate files.
As I have been using pdf.js for all other pdf parsing in this project, I was hoping to complete the above requirement using it as well.
Reading the PDF and its text content is relatively straightforward.
For example -
// Resolves with the total word count across all pages of the PDF at pdfUrl.
function GetWords(pdfUrl){
var pdf = PDFJS.getDocument(pdfUrl);
return pdf.then(function(pdf) { // calculate total count for document
var maxPages = pdf.pdfInfo.numPages;
var countPromises = []; // collecting all page promises
for (var j = 1; j <= maxPages; j++) {
countPromises.push(pdf.getPage(j).then(function(page) { // add page promise
return page.getTextContent().then(function(textContent) {
// BUG FIX: `txt` was previously a function-scoped `var` shared by
// every page's callback, so later pages counted the accumulated
// text of all earlier pages too. It is now local to this page.
var txt = "";
for (var i = 0; i < textContent.items.length; i++) {
var txtadd = textContent.items[i].str;
txt += txtadd.replace(/[^a-zA-Z0-9:;,.?!-() ]/g, '');
}
return txt.split(" ").length; // value for page words
});
}));
}
// Wait for all pages and sum counts
return Promise.all(countPromises).then(function (counts) {
var count = 0;
// BUG FIX: this summation was commented out, so the function
// always resolved with 0.
counts.forEach(function (c) { count += c; });
return count;
});
});
}
However, I can't seem to find any examples of building a PDF from one / or more of its pages. Ideally, I would want to use the pdf.GetPage(j) to get an array of the pages required. Then push these into a new document and save this new document to disk.
Any help would be appreciated.
I ended up using a separate library to perform the splitting. http://pdfhummus.com/. So in combination with the PDF.js i was able to get the desired result.

Node.js: given array of URLs, determine which are valid

I am a total scrub with the node http module and having some trouble.
The ultimate goal here is to take a huge list of urls, figure out which are valid and then scrape those pages for certain data. So step one is figuring out if a URL is valid and this simple exercise is baffling me.
say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate this array, make a get request to each and if a response comes in, add the link to a list of workingURLs (for now just another array), else it goes to a list brokenURLs.
var workingURLs = [];
var brokenURLs = [];
for (var i = 0; i < allURLs.length; i++) {
// BUG (see answer below): `url` is declared with var, so every callback
// closes over the same variable; by the time any response arrives the
// loop has ended and `url` points at the last element of allURLs.
var url = allURLs[i];
var req = http.get(url, function (res) {
if (res) {
workingURLs.push(?????); // How to derive URL from response?
}
});
req.on('error', function (e) {
brokenURLs.push(e.host);
});
}
what I don't know is how to properly obtain the url from the request/ response object itself, or really how to structure this kind of async code - because again, I am a nodejs scrub :(
For most websites using res.headers.location works, but there are times when the headers do not have this property and that will cause problems for me later on. Also I've tried console logging the response object itself and that was a messy and fruitless endeavor
I have tried pushing the url variable to workingURLs, but by the time any response comes back that would trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help
You need to closure url value to have access to it and protect it from changes on next loop iteration.
For example:
(function(url){
// use url here
})(allUrls[i]);
Most simple solution for this is use forEach instead of for.
allURLs.forEach(function(url){
//....
});
Promisified solution allows you to get a moment when work is done:
var http = require('http');
var allURLs = [
"http://www.yahoo.com/",
"http://www.stackoverflow.com/",
"http://www.sdfhksdjfksjdhg.net/"
];
var workingURLs = [];
var brokenURLs = [];
// Probe every URL in parallel; each promise files its URL into the
// working or broken bucket before resolving.
var promises = allURLs.map(url => validateUrl(url)
.then(res => (res ? workingURLs : brokenURLs).push(url)));
Promise.all(promises).then(() => {
console.log(workingURLs, brokenURLs);
});
// ----
// Resolves true when the URL answers with HTTP 200, false on any error.
// It never rejects, so the Promise.all above cannot fail fast.
function validateUrl(url) {
return new Promise((ok) => {
// BUG FIX: the original wrote `res => return ok(...)` — `return` is a
// syntax error in a braceless arrow body. The unused `fail` parameter
// of the executor was dropped as well.
http.get(url, res => ok(res.statusCode == 200))
.on('error', e => ok(false));
});
}
// Prevent nodejs from exit, don't need if any server listen.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
You can use something like this (Not tested):
const arr = ["", "/a", "", ""];
// BUG FIX: the original never closed the Promise.all(...) parenthesis — a
// syntax error — and passed `fetch` directly to map, which forwarded the
// array index as fetch's second (init) argument.
Promise.all(arr.map(url => fetch(url)))
.then(responses => responses.filter(res => res.ok).map(res => res.url))
.then(workingUrls => {
console.log(workingUrls);
console.log(arr.filter(url => workingUrls.indexOf(url) == -1));
});
EDITED
Working fiddle (Note that you can't do request to another site in the browser because of Cross domain).
UPDATED with #vp_arth suggestions
const arr = ["/", "/a", "/", "/"];
// FIX: one `const` per binding instead of a comma-chained `let` list; none
// of these are ever reassigned.
const working = [];
const notWorking = [];
// Files each fetched URL into the matching bucket and passes the response on.
const find = url => fetch(url)
.then(res => res.ok ?
working.push(res.url) && res : notWorking.push(res.url) && res);
Promise.all(arr.map(find))
.then(responses => {
// FIX: corrected the 'woking' typo in the log label.
console.log('working', working, 'notWorking', notWorking);
/* Do whatever with the responses if needed */
});
Fiddle

Resources