I cannot get URLs from a website using Node.js - node.js

I hope to scrape urls from this website by using the code below:
// Question code: tries to scrape article URLs from the sports topic page,
// but logs an empty array. Two defects are visible below.
var request = require("request");
cheerio = require("cheerio"); // NOTE(review): implicit global — missing `var`
urls = []; // NOTE(review): implicit global — missing `var`
request("http://news.sabay.com.kh/topics/sport", function(err, resp, body){
if(!err && resp.statusCode ==200){
var $ = cheerio.load(body);
// BUG: $(".article", "h4.title") treats "h4.title" as the CONTEXT, i.e.
// "find .article inside h4.title" — that matches nothing, so .each never
// runs. A descendant selector such as ".article h4.title > a" is needed.
$(".article","h4.title").each(function(){
// BUG: inside cheerio's .each, `this` is a raw DOM node which has no
// .attr() method; use this.attribs.href or $(this).attr("href").
var url = this.attr("href");
urls.push(url);
});
console.log(urls);
}
});
but I cannot get the result. When I run it, I get this:
$ node server.js
[]

First, use a proper CSS selector:
.article h4.title > a
Then, use the proper field:
var url = this.attribs.href
Which gives:
// Working version: scrape every article URL from the sports topic page.
// Fixes vs. the question code: a valid descendant selector, reading the
// href off the raw DOM node, and proper variable declarations (the
// original left `cheerio` and `urls` as implicit globals).
var request = require("request");
var cheerio = require("cheerio");
var urls = [];
request("http://news.sabay.com.kh/topics/sport", function (err, resp, body) {
    if (!err && resp.statusCode === 200) {
        var $ = cheerio.load(body);
        // Anchors that are direct children of the title heading inside
        // each article block.
        $(".article h4.title > a").each(function () {
            // `this` is a raw DOM element in cheerio's .each, so read
            // the attribute map directly.
            urls.push(this.attribs.href);
        });
        console.log(urls);
    }
});
and outputs:
[ 'http://news.sabay.com.kh/article/546826',
'http://news.sabay.com.kh/article/546763',
'http://news.sabay.com.kh/article/546520',
'http://news.sabay.com.kh/article/546568',
'http://news.sabay.com.kh/article/546460',
'http://news.sabay.com.kh/article/546448',
'http://news.sabay.com.kh/article/545674',
'http://news.sabay.com.kh/article/546235',
'http://news.sabay.com.kh/article/545698',
'http://news.sabay.com.kh/article/546091' ]

Related

Requests suddenly not working despite no apparent change from scraping source or code

I am currently trying to get better at scraping in JS and use request and cheerio. About two weeks ago I got a basic Amazon scrape to work, but this morning when I loaded my files it's no longer working. I made sure Cheerio and Request were installed in Node and tried picking up requests from Wikipedia, and it worked fine. On Amazon, my original source, the code no longer works. Nothing on their webpage seems to have changed, so I have no clue why none of my targets are working.
// Question code: fetches an Amazon product page and pulls a few fields.
// It used to work but now yields null/undefined for every selector.
// (Root cause, per the accepted answer below: the response body arrives
// gzip-compressed and request() does not decode it without `gzip: true`,
// so cheerio is parsing compressed bytes and no selector matches.)
const request = require('request');
const cheerio = require('cheerio');
request(`http://amazon.com/dp/B07R7DY911`, (error,response,html) =>{
if (!error && response.statusCode ==200) {
const $ = cheerio.load(html);
// .html() returns null when the selector matches nothing.
const productTitle = $("#productTitle").html()
const price = $("#priceblock_ourprice").text();
// Collapse runs of whitespace left over from the page markup.
const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
// The full-size image URL is stored in a data attribute, not src.
const prodImg = $('#landingImage').attr('data-old-hires');
console.log(productTitle);
console.log(price);
console.log(rating);
console.log(numReviews);
console.log(prodImg)
} else {
console.log(error);
}
})
After some playing around, I get null and undefined where I simply didn't before.
Help me stack overflow. You're my only hope!
Update:
Switched code to axios. Much better now.
// Questioner's follow-up snippet after switching to axios (axios decodes
// gzip automatically, which is why this started working).
// NOTE(review): this is a fragment — `app`, `axios`, `link`, and `links`
// are not defined here; presumably declared elsewhere in their file.
app.get("/",(req,res)=>{
axios.get(`${link}`)
.then((response)=> {
const html = response.data;
const $ = cheerio.load(html);
// NOTE(review): .html() returns null on no match, so .replace() here
// would throw — presumably acceptable once the page parses correctly.
const productName = $("#productTitle").html().replace(/\s\s+/g, '');
const amznPrice = $("#priceblock_ourprice").text();
// rating/numReviews are computed but never passed to res.render below.
const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
const prodImg = $('#landingImage').attr('data-old-hires');
res.render("home", {
productTitle: productName,
price:amznPrice,
prod_Img:prodImg,
azLink:links,
});
// NOTE(review): no .catch() — a failed request rejects unhandled.
});
});
It appears that you're getting a compressed output in a format that the request() library does not understand. If you add the gzip: true option in the request() call, then the code starts working for me.
// Accepted fix: Amazon serves the page gzip-compressed, and the request()
// library does not decompress unless told to. Passing `gzip: true` makes
// the body arrive as plain HTML so the selectors match again.
const request = require('request');
const cheerio = require('cheerio');

const target = { url: 'http://amazon.com/dp/B07R7DY911', gzip: true };

request(target, function (err, res, page) {
    // Bail out early on transport errors or non-OK responses.
    if (err || res.statusCode != 200) {
        console.log(err);
        return;
    }
    const $ = cheerio.load(page);
    const productTitle = $("#productTitle").html();
    const price = $("#priceblock_ourprice").text();
    // Strip the whitespace runs the page markup leaves behind.
    const rating = $('#centerCol #acrPopover').text().replace(/\s\s+/g, '');
    const numReviews = $('#centerCol #acrCustomerReviewText').text().replace(/\s\s+/g, '');
    const prodImg = $('#landingImage').attr('data-old-hires');
    console.log("productTitle", productTitle);
    console.log("price", price);
    console.log("rating", rating);
    console.log("numReviews", numReviews);
    console.log("prodImg", prodImg);
});

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
// Question code: tries to reuse a browser-console jQuery trick
// ($("th:contains(Weight)")) server-side with cheerio, but gets undefined.
// NOTE(review): likely the spec table is not present in the static HTML
// that needle downloads (rendered client-side) — the answer below reads
// the embedded JSON instead. TODO confirm against the fetched body.
var needle = require('needle');
var cheerio = require('cheerio');
function rei(product) {
//Request page from rei.com and follow the redirect
return needle("get", "https://rei.com/product/" + product, {
follow_max: 5
}).then(function(response) {
var $ = cheerio.load(response.body);
// [0] unwraps the cheerio set to a raw DOM node; undefined when the
// selector matched nothing.
var test = $("th:contains(Weight)").parent()[0];
console.log(test);
}).catch(function(error) {
console.log(error);
})
};
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this:
var needle = require('needle');
var cheerio = require('cheerio');

/**
 * Logs the "Weight" spec values for an REI product.
 *
 * The spec table is not scrapeable from the static markup; the product
 * details are embedded as JSON in a tagged <script> element, so we parse
 * that instead of using a :contains() selector.
 *
 * Changes vs. the original answer: removed the unused `fs` require,
 * tightened `==` to `===`, and dropped the stray semicolon after the
 * function declaration.
 *
 * @param {number|string} product - REI product id, e.g. 893905.
 * @returns {Promise} resolves after logging; errors are logged, not thrown.
 */
function rei(product) {
    // Request page from rei.com and follow the redirect.
    return needle("get", "https://rei.com/product/" + product, {
        follow_max: 5
    }).then(function (response) {
        var $ = cheerio.load(response.body);
        // Your data lives in this script tag as a JSON payload.
        var content = JSON.parse(
            $('script[data-client-store="product-details"]').html()
        );
        for (var spec of content.specs) {
            if (spec.name === 'Weight') {
                console.log(spec.values);
            }
        }
    }).catch(function (error) {
        console.log(error);
    });
}
rei(893905);

Sort scraped Data into table on server page

Hi, I'm working on a scraper script. So far I've been able to scrape from 2 elements. At this testing stage I do not have a database set up, so I thought I'd just sort this straight to my server page. This is my working code:
// Question code: tiny HTTP server that scrapes home/away team names and
// renders them as an HTML table. Works, but the layout comes out wrong
// (see notes below) — which is what the question asks about.
var http = require('http');
var request = require('request');
var cheerio = require('cheerio');
http.createServer(function (req, res) {
request('http://www.xscores.com/soccer', function (error, response,
html) {
if (!error && response.statusCode == 200) {
var $ = cheerio.load(html);
var HomeTeam = "";
var AwayTeam = "";
// NOTE(review): the two loops below build the home cells and away cells
// as two independent strings, so rows can't be paired up — all home
// cells are emitted, then all away cells (the "single row" symptom).
$('div.score_home_txt.score_cell.wrap').each(function (i, element) {
var a = $(this).text();
var a = a.toLowerCase(); // NOTE(review): redeclares `a` — harmless but redundant
HomeTeam += "<tr><td>" + a + "</td>";
//console.log(a);
});
$('div.score_away_txt.score_cell.wrap').each(function (i, element) {
var b = $(this).text();
var b = b.toLowerCase();
// NOTE(review): "<tr>" here never closes the row ("</tr>" intended).
AwayTeam += "<td>" + b + "</td><tr>";
//console.log(b);
});
// NOTE(review): `var html` shadows the callback's `html` parameter, and
// the <th> header cells are not wrapped in a <tr>.
var html = "<table><th>" + "HomeTeam</th><th>AwayTeam</th>" + HomeTeam + AwayTeam + "</table>"
res.writeHead(200, {
'Content-Type': 'text/html'
});
res.end(html);
}
});
}).listen(8080);
console.log('Server is running at http://178.62.253.206:8080/');
The plan was to sort this into a table with 2 columns, Home in column A and Away in column B, but I'm a little unsure how to write this so it gets sorted correctly.
The code above sorts this into a single row. I've tried a few different approaches but haven't figured out the correct way yet. :/
Any help would be much appreciated
Frederik
You need to find a common parent; looking at the website you are scraping, .score_line looks like a reasonable option:
// Walk a common parent (.score_line) so each home cell stays paired with
// its away cell, emitting one <tr> per match instead of two column blobs.
// NOTE: written against an Express-style `res` (res.set/res.status/res.send).
// assume we're always going to return html
res.set('Content-Type', 'text/html');
// hit API
request('http://www.xscores.com/soccer', (err, response, html) => {
  if (err || response.statusCode !== 200) {
    // log error internally
    console.error(err ? err.message : `API status code: ${response.statusCode}`);
    // return client response
    return res.status(500).send('<b>Internal Server Error</b>');
  }
  const $ = cheerio.load(html);
  const rows = [];
  // find each row
  $('.score_line').each((i, el) => {
    // BUG FIX: cheerio's .each passes `el` as a raw DOM node, which has
    // no .find() method — wrap it with $() first.
    const line = $(el);
    // extract each column
    const homeScore = line.find('.score_home.score_cell.wrap').text().toLowerCase();
    const awayScore = line.find('.score_away.score_cell.wrap').text().toLowerCase();
    // build row
    rows.push(`<tr><td>${homeScore}</td><td>${awayScore}</td></tr>`);
  });
  // build & send table
  res.send(`<table>${rows.join('')}</table>`);
});

WebScraping & web navigation simulation

I'm making a web scraper and I already know how to scrape some data and convert it to JSON with this code I made:
// Question code: scrapes footmercato.net and writes the result to JSON.
// The asker says this works; they want form-filling/navigation next.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();
var url = 'http://www.footmercato.net/';
request(url, function(err, resp, body) {
if (!err) {
var $ = cheerio.load(body);
var data = [];
var i = 1; // NOTE(review): unused — shadowed by the .each callback's `i`
$('.text').each(function(i, element) {
// NOTE(review): $('p') selects EVERY <p> on the page each iteration,
// not the paragraphs inside this .text element — presumably
// $(element).find('p') was intended; verify against the site markup.
var article = $('p');
var jsObject = { title : "", article : "", date : "" };
var articleTxt = article.text();
jsObject.article = articleTxt;
data.push(jsObject);
})
var json = JSON.stringify(data);
// NOTE(review): stringifies the already-stringified `json`, producing a
// double-encoded file (a JSON string literal, not a JSON array).
fs.writeFile('output.json', JSON.stringify(json, null, 4), function(err) {
console.log('File successfully written!');
})
}
});
app.listen('8080');
But I would like to navigate the website I'm scraping, fill out forms, and go to other pages.
Does somebody know if I can do it with cheerio, or how I can add it to my existing code?
Thanks
You can use webdriverio. It will actually open a browser window, and then you can manipulate the DOM through the webdriverio API to handle forms and mouse clicks, and navigate from one page to another.
// Minimal webdriverio session: launch Firefox, load Google, print the
// page title, then tear the session down.
var webdriverio = require('webdriverio');

var options = {
    desiredCapabilities: {
        browserName: 'firefox'
    }
};

var session = webdriverio.remote(options);

session
    .init()
    .url('http://www.google.com')
    .getTitle()
    .then(function (title) {
        console.log('Title was: ' + title);
    })
    .end();

How to create a global array in Node.js?

Do I need to keep each link in an array within "request({..})" and then display it, or can I work on it outside of "request({..})"? This is my code, but it does not work. Any ideas?
// Question code: collects hrefs into a module-level array inside the
// request callback, then tries to print them outside it.
var request = require("request");
var cheerio = require("cheerio");
var arrayLinks = [];
request({
uri: "http://www.some-url.com",
}, function(error, response, body) {
var $ = cheerio.load(body);
$("a").each(function() {
var link = $(this);
arrayLinks.push(link.attr("href"));
});
});
// BUG: request() is asynchronous — this line runs immediately, before the
// callback above has fired, so arrayLinks is still empty here. The array
// must be consumed from inside the callback (see the answer below).
arrayLinks.forEach(function(link){console.log(link)});
For example:
// Answer: consume the array from inside the request callback, so it only
// runs once the asynchronous response has been handled.
var request = require("request");
var cheerio = require("cheerio");
var arrayLinks = [];

request({
    uri: "http://www.some-url.com",
}, function (error, response, body) {
    // Some logic.
    linkTheArray();
});

// Safe to read arrayLinks here: only invoked from the callback above.
function linkTheArray() {
    for (var i = 0; i < arrayLinks.length; i++) {
        console.log(arrayLinks[i]);
    }
}
Now you can run it after the request is done. There is one other way, but it is pretty ugly: you can run a timeout function until you get some data in the array.

Resources