Scraping problems - node.js

I have two questions.
I am trying to get the value of propval, but when I run the scraper I just get an empty string back. Should I use a different method besides filter, or am I not choosing the right element? I've tried different elements, but the result is the same.
Can I use a loop to go through every single table row to collect all the propval ids, or is there a more efficient way?
const express = require('express');
const fs = require('fs');
const request = require('request');
const cheerio = require('cheerio');

var app = express();

app.get('/scrape', function(req, res) {
    url = 'http://streak.espn.com/en/';
    request(url, function(error, response, html) {
        if (!error) {
            var $ = cheerio.load(html);
            var gameQuestion, propVal;
            var json = { gameQuestion: "", propVal: "" };

            $('.gamequestion').each(function() {
                var data = $(this);
                gameQuestion = data.text();
                json.gameQuestion = gameQuestion;
            });

            $('a#.matchupDiv').each(function() {
                var data = $(this);
                propVal = data.text();
                json.propVal = propVal;
            });
        }
        fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
            console.log('File successfully written! - Check your project directory for the output.json file');
        });
        res.send('Check your console!');
    });
});

app.listen('8081');
console.log('magic happens on port 8081');

exports = module.exports = app;
<tbody>
    <tr>
        <td rowspan="2" class="mg-column1 start"></td>
        <td rowspan="2" class="mg-column2 start"></td>
        <td rowspan="2" class="mg-column3 start"></td>
        <div class="mg-check" propval="m57207o58439" name="matchupDiv">&nbsp;</div>

The jQuery .filter() function takes either an element, a selector, a function, or another jQuery object (http://api.jquery.com/filter/). Since you are passing a function into the filter, it expects you to return a boolean (a true/false value).
What you probably want is the .each() function (http://api.jquery.com/jquery.each/). When you query a class, you get back an array of matching objects; .each() will loop through that array and let you do what you're trying to do there. Give it a shot.
ADDED***** I was looking at the source code, and there is no class propval; it's an attribute on the .mg-check element. You need to change your query to (note the lowercase attribute name, since cheerio normalizes HTML attribute names to lowercase):
$('.mg-check').attr('propval')
But .mg-check doesn't appear to have any children. What are you trying to scrape exactly?
EDITED *** To get an array of the propvals, try this:
Change the initialization of json.propVal from '' to [], then...
$('.mg-check').each(function(){ json.propVal.push($(this).attr('propval')); });
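Putting the pieces together, a minimal sketch of the scrape step (assuming the markup shown above; propval is queried in lowercase because cheerio normalizes HTML attribute names):
const request = require('request');
const cheerio = require('cheerio');

request('http://streak.espn.com/en/', function(error, response, html) {
    if (error) return console.error(error);
    var $ = cheerio.load(html);
    var json = { gameQuestion: [], propVal: [] };
    // Collect every game question on the page
    $('.gamequestion').each(function() {
        json.gameQuestion.push($(this).text());
    });
    // Collect the propval id from every matchup row
    $('.mg-check').each(function() {
        json.propVal.push($(this).attr('propval'));
    });
    console.log(json);
});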

Related

Nodejs stream.pipe performing asynchronously

I have a large XML file which is a combination of XML documents. I'm trying to use a node.js XML splitter and respond with the number of documents I have found.
My code looks something like this. I'm looking to get the number of documents outside the function (in the last line). Is there something I can do to achieve this?
var XmlSplit = require('./xmlsplitter.js');
const fs = require('fs');

var xmlsplit = new XmlSplit();
var no_of_docs = 0;
var inputStream = fs.createReadStream('./files/input/test.xml');

inputStream.pipe(xmlsplit).on('data', function(data) {
    var xmlDocument = data.toString();
    no_of_docs = no_of_docs + 1;
});

inputStream.pipe(xmlsplit).on('end', function() {
    console.log('Stream ended');
    console.log(no_of_docs); // <-- This prints the correct value, but the value is lost as soon as we exit this.
});

console.log("This is outside the function " + no_of_docs); // <-- I need the value here.

Having difficulties with node.js res.send() loop

I'm attempting to write a very basic scraper that loops through a few pages and outputs all the data from each URL to a single JSON file. The URL structure goes as follows:
http://url/1
http://url/2
http://url/n
Each of the URLs has a table containing information pertaining to the ID of the URL. This is the data I am attempting to retrieve and store inside a JSON file.
I am still extremely new to this and am having a difficult time moving forward. So far, my code looks as follows:
app.get('/scrape', function(req, res) {
    var json;
    for (var i = 1163; i < 1166; i++) {
        url = 'https://urlgoeshere.com' + i;
        request(url, function(error, response, html) {
            if (!error) {
                var $ = cheerio.load(html);
                var mN, mL, iD;
                var json = { mN: "", mL: "", iD: "" };

                $('html body div#wrap h2').filter(function() {
                    var data = $(this);
                    mN = data.text();
                    json.mN = mN;
                });

                $('table.vertical-table:nth-child(7)').filter(function() {
                    var data = $(this);
                    mL = data.text();
                    json.mL = mL;
                });

                $('table.vertical-table:nth-child(8)').filter(function() {
                    var data = $(this);
                    iD = data.text();
                    json.iD = iD;
                });
            }
            fs.writeFile('output' + i + '.json', JSON.stringify(json, null, 4), function(err) {
                console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
            });
        });
    }
    res.send(json);
});

app.listen('8081');
console.log('Magic happens on port 8081');

exports = module.exports = app;
When I run the code as displayed above, the output files only contain data for the last URL. I presume that's because I attempt to save all the data within the same variable?
If I include res.send() inside the loop, so the data writes after each page, I receive an error that multiple headers cannot be sent.
Can someone provide some pointers as to what I'm doing wrong? Thanks in advance.
Ideal output I would like to see:
Page ID: 1
Page Name: First Page
Color: Blue
Page ID: 2
Page Name: Second Page
Color: Red
Page ID: n
Page Name: Nth Page
Color: Green
I can see a number of problems:
Your loop doesn't wait for the asynchronous operations in the loop, so you call res.send() before the asynchronous operations in the loop have completed.
Inappropriate use of cheerio's .filter().
Your json variable is constantly being overwritten, so it only holds the last page's data.
Your loop variable i will have lost its value by the time you try to use it in the fs.writeFile() statement.
Here's one way to deal with those issues:
const rp = require('request-promise');
const fsp = require('fs').promises;

app.get('/scrape', async function(req, res) {
    let data = [];
    for (let i = 1163; i < 1166; i++) {
        const url = 'https://urlgoeshere.com/' + i;
        try {
            const html = await rp(url);
            const $ = cheerio.load(html);
            const mN = $('html body div#wrap h2').first().text();
            const mL = $('table.vertical-table:nth-child(7)').first().text();
            const iD = $('table.vertical-table:nth-child(8)').first().text();
            // create object for this iteration of the loop
            const obj = { iD, mN, mL };
            // add this object to our overall array of all the data
            data.push(obj);
            // write a file specifically for this invocation of the loop
            await fsp.writeFile('output' + i + '.json', JSON.stringify(obj, null, 4));
            console.log('File successfully written! - Check your project directory for the output' + i + '.json file');
        } catch (e) {
            // stop further processing on an error
            console.log("Error scraping ", url, e);
            res.sendStatus(500);
            return;
        }
    }
    // send all the data we accumulated (in an array) as the final result
    res.send(data);
});
Things different in this code:
Switch over all variable declarations to let or const
Declare route handler as async so we can use await inside.
Use the request-promise module instead of request. It has the same features, but returns a promise instead of using a plain callback.
Use the promise-based fs module (in latest versions of node.js).
Use await in order to serialize our two asynchronous (now promise-returning) operations so the for loop will pause for them and we can have proper sequencing.
Catch errors and stop further processing and return an error status.
Accumulate an object of data for each iteration of the for loop into an array.
Change .filter() to .first().
Make the response to the request handler be a JSON array of data.
FYI, you can tweak the organization of the data in obj however you want, but the point here is that you end up with an array of objects, one for each iteration of the for loop.
EDIT Jan 2020 - request() module in maintenance mode
FYI, the request module and its derivatives like request-promise are now in maintenance mode and will not be actively developed to add new features. You can read more about the reasoning here. There is a list of alternatives in this table with some discussion of each one. I have been using got() myself; it's built from the ground up on promises and is simple to use.
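For illustration, a minimal sketch of the same fetch step using got (assuming got v11, the last release loadable with require(); the URL is the hypothetical one from the answer above):
const got = require('got');
const cheerio = require('cheerio');

(async () => {
    // got() resolves with a response object whose body holds the HTML
    const { body: html } = await got('https://urlgoeshere.com/1163');
    const $ = cheerio.load(html);
    console.log($('h2').first().text());
})();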

Click function not executing in Nightmare.js

I'm trying to do some scraping with Nightmare and my work is almost functional. The problem is that I'm facing an issue when I try to perform a click() after evaluate() and run() have been called. After I run those two functions I try to perform another click to move to another part of the website, but the click() is not executing.
At this point I'm not sure what the issue is. I have a few assumptions: maybe those functions are asynchronous and I'm trying to click() when the callbacks aren't ready yet, or one of those functions ends the current nightmare object and I don't have the scope anymore.
var Nightmare = require('nightmare');
//var nightmare = Nightmare({show:true})
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');

var app = express();

var urlWeb = "someurl";
var selectCity = "#ddl_city";
var selectTheater = "#ddl_theater";
var enterBtn = "#btn_enter";
var mainSelector = "#aspnetForm";
var flagReady = true;

new Nightmare({ show: true })
    .goto(urlWeb)
    .wait(selectCity)
    .select(selectCity, '19')
    .wait(8000)
    .select(selectTheater, '12')
    .wait(1000)
    .click(enterBtn)
    .wait(mainSelector)
    .evaluate(function() {
        // returning HTML for cheerio
        return document.body.innerHTML;
    })
    .run(function(err, nightmare) {
        if (err) return console.log(err);
        // Loading HTML body with cheerio
        var $ = cheerio.load(nightmare);
        // Looping over each div in the "Cartelera para Hoy" (today's showtimes) section
        $('.showtimeDaily').each(function(index, element) {
            // Spanish title
            console.log($(this).find('h3').children().text());
            // English title
            console.log($(this).find('h4').text());
            // schedule for today
            console.log($(this).find('li').children().text() + " ");
            // img for movie
            console.log($(this).find('img').attr('src'));
            // showtime data such as genre, length, language
            console.log($(this).find('.showtimeData').text());
            var showtimeData = $(this).find('.showtimeData').text();
            //console.log(JSON.stringify(showtimeData.replace(/\t|\n/g, "")));
        });
        console.log('Done!');
    })
    //***** here is when I try to click *****
    .click('a[href="../showtimes/weekly.aspx"]');
I was having issues with the asynchronous callbacks, so I nested the calls of the nightmare object to make sure the tasks ran one after another. This is the code:
nightmare
    .goto(urlWeb)
    .wait(selectCity)
    .select(selectCity, '19')
    .wait(8000)
    .select(selectTheater, '12')
    .wait(1000)
    .click(enterBtn)
    .wait(mainSelector)
    .evaluate(function() {
        // returning HTML for cheerio
        return document.body.innerHTML;
    })
    .then(function(body) {
        // Loading HTML body with cheerio
        var $ = cheerio.load(body);
        // Looping over each div in the "Cartelera para Hoy" (today's showtimes) section
        $('.showtimeDaily').each(function(index, element) {
            // Spanish title
            console.log($(this).find('h3').children().text());
            // English title
            console.log($(this).find('h4').text());
            // schedule for today
            console.log($(this).find('li').children().text() + " ");
            // img for movie
            console.log($(this).find('img').attr('src'));
            // showtime data such as genre, length, language
            console.log($(this).find('.showtimeData').text());
            var showtimeData = $(this).find('.showtimeData').text();
            //console.log(JSON.stringify(showtimeData.replace(/\t|\n/g, "")));
        });
        //** Here I call nightmare again to run after the first callback is done **
        nightmare
            .goto('')
            .wait('body')
            .title()
            .then(function(title) {
                console.log(title);
            });
        console.log('Done!');
    });
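One caveat with this pattern: code placed after the inner nightmare call, including the console.log('Done!'), still runs before the inner chain finishes. Returning the inner chain from the .then() callback sequences it properly; a sketch:
nightmare
    .evaluate(function() {
        return document.body.innerHTML;
    })
    .then(function(body) {
        // ... cheerio work on body ...
        // Returning the inner chain makes the outer .then() wait for it
        return nightmare
            .click('a[href="../showtimes/weekly.aspx"]')
            .wait('body')
            .title();
    })
    .then(function(title) {
        console.log(title);
        console.log('Done!');
    });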

Node.js: given array of URLs, determine which are valid

I am a total scrub with the node http module and am having some trouble.
The ultimate goal here is to take a huge list of URLs, figure out which are valid, and then scrape those pages for certain data. So step one is figuring out whether a URL is valid, and this simple exercise is baffling me.
Say we have an array allURLs:
["www.yahoo.com", "www.stackoverflow.com", "www.sdfhksdjfksjdhg.net"]
The goal is to iterate over this array, make a GET request to each, and if a response comes in, add the link to a list of workingURLs (for now just another array); else it goes to a list of brokenURLs.
var workingURLs = [];
var brokenURLs = [];

for (var i = 0; i < allURLs.length; i++) {
    var url = allURLs[i];
    var req = http.get(url, function(res) {
        if (res) {
            workingURLs.push(?????); // How to derive URL from response?
        }
    });
    req.on('error', function(e) {
        brokenURLs.push(e.host);
    });
}
What I don't know is how to properly obtain the URL from the request/response object itself, or really how to structure this kind of async code, because again, I am a node.js scrub :(
For most websites, using res.headers.location works, but there are times when the headers do not have this property, and that will cause problems for me later on. I've also tried console logging the response object itself, and that was a messy and fruitless endeavor.
I have tried pushing the url variable to workingURLs, but by the time any response comes back to trigger the push, the for loop is already over and url is forever pointing to the final element of the allURLs array.
Thanks to anyone who can help
You need to close over the url value to keep access to it and protect it from changes on the next loop iteration.
For example:
(function(url) {
    // use url here
})(allURLs[i]);
The simplest solution is to use forEach instead of for:
allURLs.forEach(function(url) {
    //....
});
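Applied to the original loop, a sketch (note that http.get requires absolute URLs, so entries like "www.yahoo.com" need an "http://" prefix):
allURLs.forEach(function(url) {
    http.get(url, function(res) {
        res.resume(); // drain the response so the socket is released
        workingURLs.push(url); // url is safely captured per iteration
    }).on('error', function(e) {
        brokenURLs.push(url);
    });
});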
A promisified solution lets you know the moment all the work is done:
var http = require('http');

var allURLs = [
    "http://www.yahoo.com/",
    "http://www.stackoverflow.com/",
    "http://www.sdfhksdjfksjdhg.net/"
];
var workingURLs = [];
var brokenURLs = [];

var promises = allURLs.map(url => validateUrl(url)
    .then(res => (res ? workingURLs : brokenURLs).push(url)));

Promise.all(promises).then(() => {
    console.log(workingURLs, brokenURLs);
});

// ----
function validateUrl(url) {
    return new Promise((ok, fail) => {
        http.get(url, res => ok(res.statusCode == 200))
            .on('error', e => ok(false));
    });
}

// Prevent node.js from exiting; not needed if any server is listening.
var t = setTimeout(() => { console.log('Time is over'); }, 1000).ref();
You can use something like this (not tested):
const arr = ["", "/a", "", ""];

Promise.all(arr.map(fetch))
    .then(responses => responses.filter(res => res.ok).map(res => res.url))
    .then(workingUrls => {
        console.log(workingUrls);
        console.log(arr.filter(url => workingUrls.indexOf(url) == -1));
    });
EDITED
Working fiddle (note that you can't make a request to another site in the browser because of cross-domain restrictions).
UPDATED with @vp_arth's suggestions
const arr = ["/", "/a", "/", "/"];

let working = [], notWorking = [],
    find = url => fetch(url)
        .then(res => res.ok ?
            working.push(res.url) && res : notWorking.push(res.url) && res);

Promise.all(arr.map(find))
    .then(responses => {
        console.log('working', working, 'notWorking', notWorking);
        /* Do whatever with the responses if needed */
    });

get an array of elements from findElement(By.className())

I am writing a script in node.js for Selenium that will go to a page, grab the inner HTML of every element with a certain CSS class, and store them in an array.
var element = driver.findElement(By.className("hll"));
element.getInnerHtml().then(function(html) {
    // returns one element's html where I want multiple
});
To retrieve the HTML of multiple elements, you can use driver.findElements() to find all matching elements. This will provide a Promise that resolves with the elements in an Array.
var pendingElements = driver.findElements(By.className('hll'));

pendingElements.then(function (elements) {
    // ...
});
You'll need to iterate over the collection and request each element's HTML. You can use the Array's .map() to create a collection of promises from getInnerHtml():
var pendingHtml = elements.map(function (elem) {
    return elem.getInnerHtml();
});
To wait for them to be resolved, you can pass the collection to promise.all().
promise.all(pendingHtml).then(function (allHtml) {
    // ...
});
Note, you'll need a reference to Selenium's promise for that.
var promise = require('selenium-webdriver').promise;
Combined:
// ...
var promise = require('selenium-webdriver').promise;

var pendingElements = driver.findElements(By.className('hll'));

pendingElements.then(function (elements) {
    var pendingHtml = elements.map(function (elem) {
        return elem.getInnerHtml();
    });

    promise.all(pendingHtml).then(function (allHtml) {
        // `allHtml` will be an `Array` of strings
    });
});
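As an aside, getInnerHtml() was removed from later selenium-webdriver releases; the usual replacement (worth verifying against your driver version) is getAttribute('innerHTML') with native promises. A sketch:
const { By } = require('selenium-webdriver');

async function getAllHtml(driver) {
    const elements = await driver.findElements(By.className('hll'));
    // Resolve each element's HTML in parallel via the innerHTML attribute
    return Promise.all(elements.map(el => el.getAttribute('innerHTML')));
}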
