Node.js request and cheerio output blank

I'm learning web scraping with Node.js using request and cheerio. I wrote some simple code to display the titles from a web page.
My code:
const request = require("request");
const cheerio = require("cheerio");

const url = "https://singapore.craigslist.org/d/automotive-services/search/aos";

async function scrapeCraigslist() {
  try {
    const htmResult = await request.get(url);
    const $ = await cheerio.load(htmResult);
    $(".result-info").each((index, element) => {
      const title = $(element)
        .children(".result-title")
        .text();
      console.log(title);
      console.log("sk");
    });
  } catch (err) {
    console.error(err);
  }
}

scrapeCraigslist();
But when I run the code I get nothing: no errors and no output.
Output:
Microsoft Windows [Version 10.0.18362.720]
(c) 2019 Microsoft Corporation. All rights reserved.
C:\Users\Ahmed-PC\craigslist>node index.js
C:\Users\Ahmed-PC\craigslist>
My selector finds the results in the Chrome Developer Tools console, but not in the Node.js code.

You're using request with a promise-style interface. If you want to do that, you'll need to use request-promise (or you could use axios, node-fetch, etc.).
If you use request-promise, your code should work fine:
request-promise
const cheerio = require("cheerio");
const rp = require("request-promise");

const url = "https://singapore.craigslist.org/d/automotive-services/search/aos";

async function scrapeCraigslist() {
  try {
    const htmResult = await rp.get(url);
    const $ = cheerio.load(htmResult); // cheerio.load() is synchronous, no await needed
    $(".result-info").each((index, element) => {
      const title = $(element)
        .children(".result-title")
        .text();
      console.log(title);
      console.log("sk");
    });
  } catch (err) {
    console.error(err);
  }
}

scrapeCraigslist();
request (with callback)
const request = require("request");
const cheerio = require("cheerio");

const url = "https://singapore.craigslist.org/d/automotive-services/search/aos";

function scrapeCraigslist() {
  request.get(url, (error, response, htmResult) => {
    if (error) {
      // Something went wrong
      console.error(error);
    } else {
      // The request was successful
      const $ = cheerio.load(htmResult);
      $(".result-info").each((index, element) => {
        const title = $(element)
          .children(".result-title")
          .text();
        console.log(title);
        console.log("sk");
      });
    }
  });
}

scrapeCraigslist();
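Both request and request-promise have since been deprecated, so for completeness, here is the same scrape with axios (one of the alternatives mentioned above); a minimal sketch, assuming the page still serves the .result-info / .result-title markup:
const axios = require("axios");
const cheerio = require("cheerio");

const url = "https://singapore.craigslist.org/d/automotive-services/search/aos";

async function scrapeCraigslist() {
  try {
    // axios resolves with a response object; the body is on .data
    const { data } = await axios.get(url);
    const $ = cheerio.load(data);
    $(".result-info").each((index, element) => {
      const title = $(element).children(".result-title").text();
      console.log(title);
    });
  } catch (err) {
    console.error(err);
  }
}

scrapeCraigslist();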

Related

Async function to scrape subreddits using Cheerio returns undefined

The script by itself works great (entering the URL manually, writing a JSON file using the fs module, node script_name.js), but within an Express request handler it returns undefined.
So I've built a simple frontend to let the user enter the subreddit name to be scraped.
And here's where the problem is:
Express controller
const run = require("../run");

requestPosts: async (req, res) => {
  try {
    const { subreddit } = req.body;
    const response = await run(subreddit);
    //console.log(response);
    res.json(response);
  } catch (error) {
    console.error(error);
  }
},
Cheerio functions
const axios = require("axios");
const { load } = require("cheerio");

let posts = [];

async function getImage(postLink) {
  const { data } = await axios(postLink);
  const $ = load(data);
  return $("a.post-link").attr("href");
}

async function run(url) {
  try {
    console.log(url);
    const { data } = await axios(url);
    const $ = load(data);
    $(".thing.linkflair.link").map(async (i, e) => {
      const title = $(e)
        .find(".entry.unvoted .top-matter .title .title")
        .text();
      const user = $(e)
        .find(".entry.unvoted .top-matter .tagline .author")
        .text();
      const profileLink = `https://old.reddit.com/user/${user}`;
      const postLink = `https://old.reddit.com/${$(e).find("a").attr("href")}`;
      // const thumbail = $(e).find("a img").attr("src");
      const image = await getImage(postLink);
      posts.push({
        id: i + 1,
        title,
        postLink,
        image,
        user: { user, profileLink },
      });
    });
    const nextPage = $(".next-button a").attr("href");
    if (nextPage) {
      await run(nextPage);
    } else {
      return posts;
    }
  } catch (error) {
    console.error(error);
  }
}

module.exports = run;
I've tried working with new Promise((resolve, reject) => {}).
I think it's returning undefined because the code isn't synchronized
(I don't know if that makes sense; I've just started programming).
.map() is not promise-aware and does not wait for your promises to finish. So $(".thing.linkflair.link").map() finishes long before any of the asynchronous functions inside its callback do, and you try to return posts BEFORE it has been populated.
Passing an async callback to .map() produces a collection of promises. You can use Promise.all() on those promises to know when they are all done, and once you're doing that, you may as well return each post object from the callback rather than pushing into a higher-scoped shared array, which makes the code more self-contained.
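As a minimal illustration with a plain array (an async callback always returns a promise, so the mapped values are promises, not results):
(async () => {
  const results = [1, 2, 3].map(async (n) => n * 2);
  console.log(results);        // three promises, not [2, 4, 6]
  const values = await Promise.all(results);
  console.log(values);         // [2, 4, 6]
})();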
I would suggest this code:
async function run(url) {
  try {
    console.log(url);
    const { data } = await axios(url);
    const $ = load(data);
    const posts = await Promise.all(
      $(".thing.linkflair.link")
        .map(async (i, e) => {
          const title = $(e)
            .find(".entry.unvoted .top-matter .title .title")
            .text();
          const user = $(e)
            .find(".entry.unvoted .top-matter .tagline .author")
            .text();
          const profileLink = `https://old.reddit.com/user/${user}`;
          const postLink = `https://old.reddit.com/${$(e).find("a").attr("href")}`;
          // const thumbail = $(e).find("a img").attr("src");
          const image = await getImage(postLink);
          // return a post object
          return {
            id: i + 1,
            title,
            postLink,
            image,
            user: { user, profileLink },
          };
        })
        .get() // .get() turns the cheerio collection into a plain array of promises
    );
    const nextPage = $(".next-button a").attr("href");
    if (nextPage) {
      const newPosts = await run(nextPage);
      // add these posts to the ones we already have
      posts.push(...newPosts);
    }
    return posts;
  } catch (error) {
    console.error(error);
  }
}
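With this version, await run(subreddit) in the Express controller above resolves to the fully populated array, so res.json(response) sends the posts instead of undefined.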

How to do web scraping using cheerio

I've tried this code:
const cheerio = require("cheerio");
const axios = require("axios");

async function getProducts() {
  try {
    const res = await axios.get("https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup");
    const html = res.data;
    const $ = cheerio.load(html);
    const products = [];
    $("ul[data-testid]").each((i, el) => {
      const title = $(el).find('a[data-testid="product_name"]').text().trim();
      const price = $(el).find('div[data-testid="product_price"] .css-fzp91j').text().trim();
      products.push({ title, price });
    });
    console.log(products);
  } catch (err) {
    console.log(err);
  }
}

getProducts();
I need a product list array containing title and price, but this code returns an empty array. What should I do to get these details? Example link: https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup
Amazon works, but this Carrefour website isn't working for web scraping!
I tried this expecting to get the details and prices of the products using cheerio and Node.js.
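A likely cause (not confirmed in this thread): the Carrefour page builds its product list with client-side JavaScript, and cheerio only parses the static HTML that axios downloads; it never executes scripts. A quick way to check is to search the raw response for the attribute the selectors rely on:
const axios = require("axios");

(async () => {
  const url = "https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup";
  const { data } = await axios.get(url);
  // if this prints false, the product markup is injected after page load,
  // and a headless browser (e.g. Puppeteer) is needed instead of plain cheerio
  console.log(data.includes("data-testid"));
})();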

How to use image4io on Node.js

I'm having a problem following the documentation for image4io. I just tried stuff. The problem is that image4io only returns Promise { <pending> }.
Here is my code:
module.exports = function (app, db) {
  const Image4ioAPI = require("@image4io/image4ionodejssdk");

  app.post("/addImage", (req, res) => {
    var apiKey = "api key";
    var apiSecret = "api secret";
    var api = new Image4ioAPI.Image4ioClient(apiKey, apiSecret);
    let response = api.GetSubscription();
    console.log(response); // logs Promise { <pending> }
  });
};
It returns a promise, so you must use .then() (or .catch()) to access the response after the promise is fulfilled.
Ideal use case:
api.GetSubscription()
  .then(response => {
    // response is a JSON object, which can be accessed here
  })
  .catch(error => {
    throw error;
  });
Alternatively, you can await the promise to get its result:
module.exports = function (app, db) {
  const Image4ioAPI = require("@image4io/image4ionodejssdk");

  app.post("/addImage", async (req, res) => {
    var apiKey = "api key";
    var apiSecret = "api secret";
    var api = new Image4ioAPI.Image4ioClient(apiKey, apiSecret);
    let response = await api.GetSubscription();
    console.log(response);
  });
};
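One caveat with the await version: if GetSubscription() rejects, the handler throws and the client never receives a response. A minimal sketch of the same route with error handling (the res.json / res.status wiring is my addition, not from the original answers):
app.post("/addImage", async (req, res) => {
  try {
    const api = new Image4ioAPI.Image4ioClient(apiKey, apiSecret);
    const response = await api.GetSubscription();
    res.json(response);
  } catch (err) {
    // report the failure instead of leaving the request hanging
    console.error(err);
    res.status(500).send("image4io request failed");
  }
});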

I have a question about crawling. I want to use the return value outside of the function

I'm doing some crawling with Node.js and I want to use the return value (which is titleList) outside of the function, but that doesn't work.
Please give me some advice about that.
const axios = require("axios");
const cheerio = require("cheerio");

async function getHTML() {
  try {
    return await axios.get("https://google.com");
  } catch (error) {
    console.error(error);
  }
}

getHTML().then(html => {
  let titleList = [];
  const $ = cheerio.load(html.data);
  // store in bodyList
  const bodyList = $("ul.new_quickMenu_list");
  bodyList.find("li").each(function (i, elem) {
    titleList[i] = {
      title: $(this).find("span").text()
    };
  });
  //console.log(titleList);
  return titleList;
});
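Returning titleList from the .then() callback doesn't make it available outside; it only resolves the next promise in the chain, so the consuming code has to chain on that promise or await it. A minimal sketch of one way to do that, wrapping the same logic in an async function (the name getTitleList is mine, not from the question):
async function getTitleList() {
  const html = await getHTML();
  const titleList = [];
  const $ = cheerio.load(html.data);
  $("ul.new_quickMenu_list").find("li").each(function (i) {
    titleList[i] = { title: $(this).find("span").text() };
  });
  return titleList; // resolves the promise returned by getTitleList()
}

getTitleList().then(titleList => {
  // the list is only usable here (or after an await), once the promise has resolved
  console.log(titleList);
});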

Write a loop around async/await that will read and write files in parallel?

I'm using fs and PhantomJS:
const phantom = require('phantom');
const fs = require('fs');
I have 4 routes (URLs) that PhantomJS opens. When a page is opened, its content is read, and then fs writes that content out into its own HTML file.
const routes = [
  'about',
  'home',
  'todo',
  'lazy',
];
Question:
How do I loop over this async function for every value in const routes, in parallel?
(async function() {
  const instance = await phantom.create();
  const page = await instance.createPage();
  const status = await page.open(`http://localhost:3000/${routes}`);
  const content = await page.property('content');
  await fsPromise(`${routes}.html`, content);
  await instance.exit();
}());

const fsPromise = (file, str) => {
  return new Promise((resolve, reject) => {
    fs.writeFile(file, str, function (err) {
      if (err) return reject(err);
      resolve(`${routes} > ${routes}.html`);
    });
  });
};
It took me a while to get this up and running in an environment that supports async and await. It turns out Node v7.5.0 supports them natively, which is way simpler than fighting with Babel! The only other thorn in this investigation was that request-promise, which I was using to test, doesn't seem to fail gracefully when the promise isn't built properly. I saw a lot of errors like this when I tried to use await with it:
return await request.get(options).map(json => json.full_name + ' ' + json.stargazers_count);
^^^^^^^
SyntaxError: Unexpected identifier
In the end, though, I realized that your promise function doesn't actually use async/await (which is why mine errored), so the premise should be the same. Here's the test that I got working; it's very similar to yours. The key is the sequential, awaited for...of iteration:
var request = require('request-promise');

var headers = { 'User-Agent': 'YOUR_GITHUB_USERID' };

var repos = [
  'brandonscript/usergrid-nodejs',
  'facebook/react',
  'moment/moment',
  'nodejs/node',
  'lodash/lodash'
];

function requestPromise(options) {
  return new Promise((resolve, reject) => {
    request.get(options).then(json => resolve(json.full_name + ' ' + json.stargazers_count));
  });
}

(async function() {
  for (let repo of repos) {
    let options = {
      url: 'https://api.github.com/repos/' + repo,
      headers: headers,
      qs: {}, // or you can put client_id / client_secret here
      json: true
    };
    let info = await requestPromise(options);
    console.log(info);
  }
})();
And while I can't test it, I'm pretty sure this will work:
const routes = [
  'about',
  'home',
  'todo',
  'lazy',
]; // note the semicolon: without it, the IIFE's opening "(" is parsed as a call on the array

(async function() {
  for (let route of routes) {
    const instance = await phantom.create();
    const page = await instance.createPage();
    const status = await page.open(`http://localhost:3000/${route}`);
    const content = await page.property('content');
    await fsPromise(`${route}.html`, content);
    await instance.exit();
  }
}());
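Note that for...of with await handles the routes one at a time, not in parallel. If you truly want them processed in parallel, as the title asks, you can map the routes to promises and await them all together; a sketch, under the assumption that a single shared PhantomJS instance can drive several pages at once:
(async function() {
  const instance = await phantom.create();
  await Promise.all(routes.map(async (route) => {
    // each route gets its own page object
    const page = await instance.createPage();
    await page.open(`http://localhost:3000/${route}`);
    const content = await page.property('content');
    await fsPromise(`${route}.html`, content);
    await page.close();
  }));
  await instance.exit();
}());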
Since you're on a Node version with the promise-based fs API (fs.promises, Node 10+), you can also simplify fsPromise() so it doesn't declare a promise by hand:
const fsp = require('fs').promises;

const fsPromise = (file, str) => fsp.writeFile(file, str);
