How do I scrape a URL from XML in Node.js?

My end goal is to have my app display thumbnails of the X most recent images from a given user's 500px.com account (it's a photography site). The site doesn't have an API, as far as I can tell, but it does have an RSS feed for individual users, e.g. https://500px.com/janedoe/rss, that returns XML.
Using xml2js, I can parse the XML into a JS object and navigate to the "description" element that holds the HTML containing the URL I want, like so (this is just a proof of concept using the first item in the RSS feed):
var express = require('express');
var router = express.Router();
var request = require('request');
var parseString = require('xml2js').parseString;
var EventEmitter = require('events').EventEmitter;

var body = new EventEmitter();

/* GET home page. */
router.get('/', function(req, res, next) {
    request("https://500px.com/janedoe/rss", function(error, response, data) {
        body.data = data;
        body.emit('update');
    });
    body.on('update', function() {
        parseString(body.data, function (err, result) {
            // description[0] holds the CDATA block of HTML for the first feed item
            var photoLink = result.rss.channel[0].item[0].description[0];
            res.render('index', { title: 'Express', photoLink: photoLink });
        });
    });
});
This puts the entire HTML contents of the CDATA section in the photoLink variable. What I want to do is target the img src value within that HTML so I can pass the URL as a string to be rendered on the page.
I can envision using string methods to look for the first img tag and then read until the end of the address, but is there a more elegant and easier way to do this?

Try this: in this example, I find all the image URLs.
const transform = require('camaro')
const cheerio = require('cheerio')

// Pull every <description> element out of the feed as a raw HTML string
const xml = require('fs').readFileSync('feed.xml', 'utf-8')
const template = {
    data: ['//item/description', '.']
}
const result = transform(xml, template)

// For each description, load the HTML and collect the src of every <img>
const links = result.data.map(html => {
    const $ = cheerio.load(html)
    const urls = []
    $('img').each(function(i, link) {
        urls.push($(link).attr('src'))
    })
    return urls
})

console.log(links)
Output:
[ [ 'https://drscdn.500px.org/photo/629350/m%3D900/v2?webp=true&sig=4a9fa5788049efb196917cc3f1a55601af901c7157b59ec86c8aa3378c6ee557' ],
[ 'https://drscdn.500px.org/photo/625259/m%3D900/v2?webp=true&sig=55eab44535f05625ad25dae3e805b2559c1caeb4c97570d04ee0a77c52c7fb19' ],
[ 'https://drscdn.500px.org/photo/625253/m%3D900/v2?webp=true&sig=174d1b27e6f87e0a98192cf6ae051301681a51beb7297df9733956d2763af163' ],
[ 'https://drscdn.500px.org/photo/509064/m%3D900/v2?webp=true&sig=698e56114e1d8b67ad11823390f8456ae723d3a389191c43192718f18213caa8' ],
[ 'https://drscdn.500px.org/photo/509061/m%3D900/v2?webp=true&sig=2998212f82a1c3428cebb873830a99b908f463474045d4e5ebba3257808685dd' ],
[ 'https://drscdn.500px.org/photo/509060/m%3D900/v2?webp=true&sig=8082904fe1935c51fc301a0d10529475ee15124d3797f69cbaeac3fd6c5f0dcb' ],
[ 'https://drscdn.500px.org/photo/509056/m%3D900/v2?webp=true&sig=4b85086a7bf55709e77febb202636b0e09415c8ca3fc3657bfb889ad827b3cab' ] ]
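Note: this was written against an older camaro release where transform was exported directly and ran synchronously. If you are on camaro v4 or later, the export is const { transform } = require('camaro') and it returns a Promise, so you would await transform(xml, template) instead.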

You don't need a full parser for this; just do it with a regex:
var links = [];
// Capture the src attribute value of every <img> tag
var re = new RegExp("<img.*?src=['\"](.*?)['\"].*?>", "gmi");
var res;
while ((res = re.exec(body)) !== null) links.push(res[1]);
Example:
var a = '<div class="quote"><div class="quote-profile"><img alt="voyages-sncf.com logo" class="img-responsive img-circle" style="height: 80px" src="/img/app_website/index/logo.jpg"> </div><!--//profile--><img alt="voyages-sncf.com logo" class="img-responsive img-circle" style="height: 80px" src="/img/app_website/index/logo2.jpg" data-attr = "lkjlk"/>'
var links = [];
var re = new RegExp("<img.*?src=['\"](.*?)['\"].*?>", "gmi");
var res;
while ((res = re.exec(a)) !== null) links.push(res[1]);
//=> ["/img/app_website/index/logo.jpg", "/img/app_website/index/logo2.jpg"]
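That said, if the markup gets messier, a quick cheerio equivalent (assuming body holds the HTML, as above) avoids the usual regex-on-HTML pitfalls:
const cheerio = require('cheerio');

const $ = cheerio.load(body);
// .map(...).get() converts the cheerio collection into a plain array of src strings
const links = $('img').map((i, el) => $(el).attr('src')).get();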

Related

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web scraping course that uses Cheerio. I'm practicing on a different website than the one used in the course, and now I've run into the problem that all my scraped text ends up in one big object, when every title should end up in its own object. Can someone see what I did wrong? I've already banged my head against this for two hours.
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
    try {
        const htmlResult = await request.get(url);
        const $ = await cheerio.load(htmlResult);
        $("div.aanbod").each((index, element) => {
            const result = $(element).children(".item");
            const title = result.find("h2").text().trim();
            const characteristics = result.find("h4").text();
            const scrapeResult = {title, characteristics};
            scrapeResults.push(scrapeResult);
        });
        console.log(scrapeResults);
    } catch(err) {
        console.error(err);
    }
}
scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used selectors: you selected the wrapper div.aanbod and then grabbed all of its .item children at once, so .find("h2") concatenated every title into a single string. Iterating over each div.item instead scopes every .find() call to one listing. I've modified your script to fetch the content as you expected; currently it collects titles and characteristics, so feel free to add the rest.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
    try {
        const htmlResult = await request.get(url);
        const $ = cheerio.load(htmlResult);
        // Iterate over each listing so every .find() is scoped to one item
        $("div.item").each((index, element) => {
            const title = $(element).find(".kenmerken > h2").text().trim();
            const characteristics = $(element).find("h4").text().trim();
            scrapeResults.push({ title, characteristics });
        });
        console.log(scrapeResults);
    } catch(err) {
        console.error(err);
    }
}
scrapeHuurgoed();

Scrape paginated pages using Node.js and Cheerio

How can I scrape data from paginated pages?
My code works well with one page, but I need to scrape all the data from page 2, page 3, ... and push it into an ebooks array.
Here is my code:
const fetch = require('node-fetch');
const cheerio = require('cheerio');

function searchEbooks(query) {
    // getUrl(page, query) builds the search URL (defined elsewhere)
    return fetch(getUrl(1, query))
        .then(res => res.text())
        .then(body => {
            const $ = cheerio.load(body);
            const ebooks = [];
            $('article').each(function(i, element) {
                const $element = $(element);
                const $title = $element.find('.entry-title a');
                const $image = $element.find('.attachment-post-thumbnail');
                const $description = $element.find('.entry-summary');
                const authors = [];
                $(element).find('.entry-author a').each(function(i, element) {
                    const author = $(element).text();
                    authors.push(author);
                });
                const ebook = {
                    image: $image.attr('src'),
                    title: $title.text(),
                    description: $description.text(),
                    authors: authors,
                };
                ebooks.push(ebook);
            });
            return ebooks;
        });
}
I have no idea how to do this. Please give me a hint or an example.
I use the cheerio and node-fetch packages.
Thank you.
Try this to get the next page's URL:
var href = $('.current + a').attr('href');
if (href) {
    // there is a next page; fetch this url
} else {
    console.log('You got all pages');
}
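A minimal sketch of how that hint could drive your existing function, assuming getUrl(page, query) from your code builds each page's URL and .current + a marks the next-page link on this site:
const fetch = require('node-fetch');
const cheerio = require('cheerio');

// Walk pages recursively, accumulating ebooks until no next-page link is found.
function searchAllEbooks(query, page = 1, results = []) {
    return fetch(getUrl(page, query))
        .then(res => res.text())
        .then(body => {
            const $ = cheerio.load(body);
            // ...extract this page's ebooks exactly as in searchEbooks,
            // then: results.push(...ebooks);
            const href = $('.current + a').attr('href');
            if (href) {
                return searchAllEbooks(query, page + 1, results); // next page exists
            }
            return results; // last page reached
        });
}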

Cheerio returns undefined when using the "contains" selector

I am currently trying to parse some HTML from this URL:
The main information I am after is the listed Weight. Using the Console in Chrome, I can issue the command:
$("th:contains(Weight)").parent()[0];
And it will give me the table rows containing all the information I need about the weight.
I tried to use this in Cheerio, but it just returns undefined.
This is my Node.js code:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
    // Request page from rei.com and follow the redirect
    return needle("get", "https://rei.com/product/" + product, {
        follow_max: 5
    }).then(function(response) {
        var $ = cheerio.load(response.body);
        var test = $("th:contains(Weight)").parent()[0];
        console.log(test);
    }).catch(function(error) {
        console.log(error);
    });
}
rei(893905);
What would be the best way to get the information I need from Rei's website in an automated manner?
Try this: the specs table is rendered client-side, so it isn't in the HTML that needle fetches, which is why your :contains selector finds nothing. The raw page does, however, ship the product details as JSON inside a script tag, and you can read the weight from there:
var needle = require('needle');
var cheerio = require('cheerio');

function rei(product) {
    // Request page from rei.com and follow the redirect
    return needle("get", "https://rei.com/product/" + product, {
        follow_max: 5
    }).then(function(response) {
        var $ = cheerio.load(response.body);
        // The product data ships as JSON inside this script tag
        var content = $('script[data-client-store="product-details"]').html();
        content = JSON.parse(content);
        for (var spec of content.specs) {
            if (spec.name == 'Weight') {
                console.log(spec.values);
            }
        }
    }).catch(function(error) {
        console.log(error);
    });
}
rei(893905);
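As a general rule, when a page hydrates its markup from embedded JSON like this, scraping the JSON directly tends to be more robust than scraping the rendered HTML, since the data shape changes less often than the page layout.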

NodeJS: Use a pug template to display results

I have a method in a NodeJS app that handles scraping a URL and, when successful, saves that data to a Mongo database and shows the results.
Main method:
// url parameter
app.get('/urls/', function(req, res) {
    var client = new MetaInspector(req.query.url, {
        timeout: 5000
    });
    client.on("fetch", function() {
        var imagesArray = [];
        var keywordsArray = [];
        var now = new Date();
        var dateVal = dateFormat(now, "mm/dd/yyyy h:MM:ss");
        for (var i = 0; i < client.images.length; i++) {
            // we only want jpgs. nothing else.
            if (client.images[i].indexOf('.jpg') > -1) {
                imagesArray.push({
                    "image": client.images[i]
                });
            }
        }
        for (var i = 0; i < client.keywords.length; i++) {
            keywordsArray.push({
                "keyword": client.keywords[i]
            });
        }
        var newUrls = Urls({
            url: client.url,
            date_added: dateVal,
            author: client.author,
            description: client.description,
            ogTitle: client.ogTitle,
            ogDescription: client.ogDescription,
            image: client.image,
            images: imagesArray,
            // note: the original object listed "keywords" twice (once as
            // req.body.keywords); only the last key wins, so keep just this one
            keywords: keywordsArray
        });
        newUrls.save(function(err) {
            if (err) throw err;
            res.send('Success' + newUrls);
        });
    });
    client.on("error", function(err) {
        console.log(err);
    });
    client.fetch();
});
This all works well and good. But I'm using Pug and Express and have specific routes set up. Instead of sending the newUrls object via res.send, I'd like to have it go to a particular route and be passed to a particular Pug template I already have set up:
// Route.js
var express = require('express');
var router = express.Router();
var Urls = require('../models/urlModel');
var Footer = require('../models/footerModel');

/* URL Saved Success Page */
router.get('/saved', function (req, res) {
    // render logic goes here
});

module.exports = router;
My view lives in a Pug file located at /views/saved.pug:
div#body
  include nav.pug
  div.container.item-container
    div.row
      div.col-md-8
        h1 Item successfully saved.
        h5 #{item}
        h6 #{description}
I've tried using the res.send method, but that doesn't work. Any suggestions on how to handle this?
From my understanding, you want the request redirected to /saved with a payload after the URLs are saved to the database. In this scenario, you could use res.redirect with a query string:
newUrls.save(function(err) {
    if (err) throw err;
    var payload = JSON.stringify({
        url: client.url,
        date_added: dateVal,
        author: client.author,
        description: client.description,
        ogTitle: client.ogTitle,
        ogDescription: client.ogDescription,
        image: client.image,
        images: imagesArray,
        keywords: keywordsArray
    });
    // append the payload as a query string, URL-encoded so the JSON survives the redirect
    res.redirect(`/saved?payload=${encodeURIComponent(payload)}`);
});
and in the /saved route, you could parse the query and use res.render:
router.get('/saved', function (req, res) {
    if (req.query.payload) {
        // parse the JSON payload back out of the query string
        res.render('saved', JSON.parse(req.query.payload));
    }
});
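One thing to keep in mind with this approach: query strings have practical length limits (a few thousand characters in many browsers and proxies), so for large payloads it may be safer to redirect with just the saved document's id, e.g. /saved?id=..., and have the route fetch the record from Mongo before rendering. (The id parameter here is hypothetical, just to illustrate the shape.)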

WebScraping & web navigation simulation

I'm making a web scraper and I already know how to scrape some data and convert it to JSON with this code I made:
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

var url = 'http://www.footmercato.net/';
request(url, function(err, resp, body) {
    if (!err) {
        var $ = cheerio.load(body);
        var data = [];
        $('.text').each(function(i, element) {
            var article = $('p');
            var jsObject = { title: "", article: "", date: "" };
            var articleTxt = article.text();
            jsObject.article = articleTxt;
            data.push(jsObject);
        });
        // stringify the data once (double-stringifying writes a quoted string)
        fs.writeFile('output.json', JSON.stringify(data, null, 4), function(err) {
            console.log('File successfully written!');
        });
    }
});
app.listen('8080');
But I would like to navigate the website I'm scraping, fill out forms, and go to other pages.
Does somebody know if I can do this with cheerio, or how I can add it to my existing code?
Thanks
You can use WebdriverIO: it will open an actual browser window, and you can then manipulate the DOM through the WebdriverIO API to fill out forms, handle mouse clicks, and navigate from one page to another.
var webdriverio = require('webdriverio');
var options = {
    desiredCapabilities: {
        browserName: 'firefox'
    }
};

webdriverio
    .remote(options)
    .init()
    .url('http://www.google.com')
    .getTitle().then(function(title) {
        console.log('Title was: ' + title);
    })
    .end();
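For the form-filling part of your question, the same chained style works; a rough sketch (the selectors here are hypothetical, so substitute the real ones from the page you are automating):
webdriverio
    .remote(options)
    .init()
    .url('http://www.footmercato.net/')
    .setValue('#search-input', 'PSG')   // hypothetical input selector
    .click('#search-button')            // hypothetical button selector
    .getTitle().then(function(title) {
        console.log('Page after search: ' + title);
    })
    .end();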
