How to do web scraping using Cheerio - Node.js

I've tried this code
const cheerio = require("cheerio");
const axios = require('axios');

async function getProducts() {
  try {
    const res = await axios.get('https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup');
    const html = await res.data;
    const $ = cheerio.load(html);
    const products = [];
    $('ul[data-testid]').each((i, el) => {
      const title = $(el).find('a[data-testid="product_name"]').text().trim();
      const price = $(el).find('div[data-testid="product_price"] .css-fzp91j').text().trim();
      products.push({ title, price });
    });
    console.log(products);
  } catch (err) {
    console.log(err);
  }
}

getProducts();
I need a product list array containing title and price, but this code returns an empty array. What should I do to get these details? Example link: https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup.
Amazon works, but this Carrefour website is not working for web scraping!
I tried this and expected to get the details and prices of the products using Cheerio in Node.js.
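The Carrefour search page most likely builds its product list with client-side JavaScript, so the HTML that axios receives contains no product markup and the selectors match nothing (which would explain why the same approach works on Amazon but not here). A minimal sketch that renders the page with Puppeteer first and then hands the resulting HTML to cheerio; the selectors are copied from the code above and may still need adjusting:

const cheerio = require("cheerio");
const puppeteer = require("puppeteer");

async function getProducts() {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.carrefouruae.com/mafuae/en/v4/search?keyword=tomato%20kathcup', {
      waitUntil: 'networkidle2'
    });
    const html = await page.content(); // HTML after the page's JavaScript has run
    const $ = cheerio.load(html);
    const products = [];
    $('ul[data-testid]').each((i, el) => {
      const title = $(el).find('a[data-testid="product_name"]').text().trim();
      const price = $(el).find('div[data-testid="product_price"] .css-fzp91j').text().trim();
      products.push({ title, price });
    });
    console.log(products);
  } finally {
    await browser.close();
  }
}

getProducts().catch(err => console.log(err));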


How to use a Node.js web crawler

I expect the product info to be printed as each item is displayed. However, the current code prints all loaded items even if they're not shown yet.
How do I modify my code? Thank you.
// const request = require("request");
const cheerio = require("cheerio");
const puppeteer = require('puppeteer');

(async () => {
  const browser = await puppeteer.launch({
    headless: false // headless Chrome has better performance
  });
  const page = await browser.newPage();
  await page.goto('https://www.balenciaga.com/en-us/women/shoes/sneakers');
  await getData(page)
  await scrollItem(page)
})();

const scrollItem = async (page) => {
  pageHeight = await page.evaluate('document.body.scrollHeight')
  await page.evaluate('window.scrollTo(0, document.body.scrollHeight)',
    await page.waitForFunction(`document.body.scrollHeight > ${pageHeight}`),
    await getData(page)
  )
}

const getData = async (page) => {
  let body = await page.content()
  let $ = await cheerio.load(body)
  const data = []
  const list = $(".l-productgrid__item .c-product__infos");
  for (let i = 0; i < list.length; i++) {
    const title = list.eq(i).find('.c-product__infos h2').text();
    const price = list.eq(i).find('.c-product__infos p').text().trim();
    data.push({ title, price });
  }
  data.forEach((res, i) => {
    console.log(`${i + 1} Name: ${res.title}, Price: ${res.price}`)
  })
  await scrollItem(page)
}
working code:
// define function which accepts body and cheerio as args
function extract(input, cheerio) {
  // return object with extracted values
  let $ = cheerio.load(input);
  return $('.l-productgrid__item .c-product__infos').map(function () {
    return {
      header: $('h2', this).text().trim(),
      price: $('p', this).text().trim()
    }
  }).toArray()
}
proof of work (screenshot)
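The extract helper above takes the page HTML and the cheerio module as arguments; how it is wired into the Puppeteer page is not shown in the answer. A rough sketch under that assumption, which scrolls until the page stops growing and then extracts once:

// a sketch of wiring extract() into the Puppeteer page from the question
const scrapeAll = async (page) => {
  let previousHeight = 0;
  let height = await page.evaluate(() => document.body.scrollHeight);
  while (height > previousHeight) {
    previousHeight = height;
    await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
    await new Promise(resolve => setTimeout(resolve, 1500)); // give lazy-loaded items time to appear
    height = await page.evaluate(() => document.body.scrollHeight);
  }
  const data = extract(await page.content(), cheerio);
  data.forEach((item, i) => {
    console.log(`${i + 1} Name: ${item.header}, Price: ${item.price}`);
  });
};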

I'm trying to scrape some images from a website, and for some reason it does not return the image URL.

This is my code; I'm using cheerio:
const axios = require('axios')
const cheerio = require('cheerio')

const url = 'https://kbdfans.com/collections/ready-to-use'

axios(url).then(response => {
  const html = response.data
  const $ = cheerio.load(html)
  const arrayData = []
  $('.product-block.one-fifth.small-down--one-whole.grid-flex__item').each(function (index) {
    const description = $(this).find('a.product-block__title-link').text()
    const price = $(this).find('span.theme-money').text()
    const img = $(this).find('img.rimage__image.fade-in.lazyautosizes.lazyloaded').attr('srcset')
    const id = index
    arrayData.push({
      id,
      description,
      price,
      img
    })
  })
  const result = {
    arrayData
  }
  console.log(arrayData)
}).catch(err => { console.log(err) })
This is the image that I want to obtain.
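One thing worth checking: class names like lazyloaded and the srcset attribute are typically added by a lazy-loading script after the browser runs JavaScript, so they often don't exist in the raw HTML that axios receives; in that initial HTML the image URL usually sits in a data-srcset or data-src attribute instead. A sketch of a less brittle lookup that falls back between those attributes (the attribute names are an assumption about this theme, so check the actual response HTML):

$('.product-block').each(function (index) {
  const img = $(this).find('img').first();
  // prefer the live attributes, fall back to the lazy-load data-* attributes
  const imgUrl = img.attr('srcset') || img.attr('data-srcset') || img.attr('data-src') || img.attr('src');
  console.log(index, imgUrl);
});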

Scraper with Puppeteer login returns just one element of the array

This code is supposed to loop through the URLs scraped by the scrapeProductPage function. But before looping, it needs to log in so that it can obtain the prices, which are only displayed to logged-in users. Instead of looping through the URLs, it just returns the scraped data from one page. The error I get is "MaxListenersExceededWarning: Possible EventEmitter memory leak detected".
const request = require("request-promise");
const cheerio = require("cheerio");
const ObjectsToCsv = require("objects-to-csv");
const puppeteer = require('puppeteer');
const url = "https://www.example.com";
const scrapeResults = [];
async function scrapeProductPage() {
try {
const htmlResult = await request.get(url);
const $ = await cheerio.load(htmlResult);
$("td.productListing-data > a[style='position:relative;float:left;']").each((index, element) => {
let url = $(element).attr("href");
url = "https\://www.example.com/" + url;
const scrapeResult = { url };
scrapeResults.push(scrapeResult);
});
return scrapeResults;
} catch (err) {
console.error(err);
}
}
async function scrapeDescription(productsWithImages) {
process.setMaxListeners(0);
const browser = await puppeteer.launch({
headless: false
});
const page = await browser.newPage();
await page.goto('https://www.example.com/login');
await page.waitFor(500);
await page.waitFor('input[name="email_address"]');
await page.type('input[name="email_address"]', 'example#gmail.com');
await page.type('input[name="password"]', '123test');
await page.click('#btnLogin');
return await Promise.all(
productsWithImages.map(async job => {
try {
await page.goto(job.url, { waitUntil: "load" });
const content = await page.content();
const $ = await cheerio.load(content);
job.main_img = $('img#main_img').attr('src');
job.name = $('h2').text();
job.price = $("td.products_info_price").text();
return job;
} catch (error) {
console.error(error);
}
})
);
}
async function saveDataToCsv(data) {
const csv = new ObjectsToCsv(data);
console.log(csv);
}
async function scrapeWona() {
const productsWithImages = await scrapeProductPage();
const wonaFullData = await scrapeDescription(productsWithImages);
await saveDataToCsv(productsWithImages);
}
scrapeWona();
The reason you're getting the warning is process.setMaxListeners(0); it indicates you have a memory leak somewhere in the code.
You can take a look at the documentation here also: https://nodejs.org/docs/latest/api/events.html#events_emitter_setmaxlisteners_n
Take a look at the answer from here: node.js - request - How to "emitter.setMaxListeners()"?
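Separately from the warning, a likely reason only one product's data comes back is that every callback inside Promise.all shares the same page object, so the page.goto calls race each other and only the last navigation "wins". Visiting the URLs one at a time avoids that; a sketch that reuses the login steps and selectors from the question:

async function scrapeDescription(productsWithImages) {
  const browser = await puppeteer.launch({ headless: false });
  const page = await browser.newPage();
  await page.goto('https://www.example.com/login');
  await page.waitFor('input[name="email_address"]');
  await page.type('input[name="email_address"]', 'example#gmail.com');
  await page.type('input[name="password"]', '123test');
  await page.click('#btnLogin');

  // visit the product pages sequentially so concurrent navigations
  // don't fight over the single page instance
  for (const job of productsWithImages) {
    await page.goto(job.url, { waitUntil: "load" });
    const $ = cheerio.load(await page.content());
    job.main_img = $('img#main_img').attr('src');
    job.name = $('h2').text();
    job.price = $("td.products_info_price").text();
  }

  await browser.close();
  return productsWithImages;
}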

Node.js requests and cheerio output blank

I'm learning scraping using Node.js request and cheerio. I wrote a simple piece of code to display titles from a web page.
My code:
const request = require("request");
const cheerio = require("cheerio");
const url = "https://singapore.craigslist.org/d/automotive-services/search/aos"
async function scrapeCraigslist() {
try {
const htmResult = await request.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const title = $(element)
.children(".result-title")
.text();
console.log(title);
console.log("sk");
});
} catch (err) {
console.error(err);
}
}
scrapeCraigslist();
But when I run the code I get nothing: no errors and no output.
Output :
Microsoft Windows [Version 10.0.18362.720]
(c) 2019 Microsoft Corporation. All rights reserved.
C:\Users\Ahmed-PC\craigslist>node index.js
C:\Users\Ahmed-PC\craigslist>
My selection and result show up in the Chrome Developer Tools console, but not in the Node.js code.
You're using request with a promise-style interface; if you wish to do this you'll need to use request-promise (or you could use Axios, node-fetch, etc.).
If you use request-promise your code should work fine:
request-promise
const request = require("request");
const cheerio = require("cheerio");
const rp = require("request-promise");
const url = "https://singapore.craigslist.org/d/automotive-services/search/aos"
async function scrapeCraigslist() {
try {
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const title = $(element)
.children(".result-title")
.text();
console.log(title);
console.log("sk");
});
} catch (err) {
console.error(err);
}
}
scrapeCraigslist();
request (with callback)
const request = require("request");
const cheerio = require("cheerio");
const url = "https://singapore.craigslist.org/d/automotive-services/search/aos"
async function scrapeCraigslist() {
request.get(url, async (error, response, htmResult) => {
if (error) {
// Something went wrong
console.error(error);
} else {
// The request was successful
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const title = $(element)
.children(".result-title")
.text();
console.log(title);
console.log("sk");
});
}
});
}
scrapeCraigslist();
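Since the answer mentions Axios and node-fetch as alternatives, here is a minimal sketch of the same scraper using Axios, which returns promises natively:

const axios = require("axios");
const cheerio = require("cheerio");

const url = "https://singapore.craigslist.org/d/automotive-services/search/aos";

async function scrapeCraigslist() {
  try {
    const { data: html } = await axios.get(url); // axios resolves with the response body on .data
    const $ = cheerio.load(html);
    $(".result-info").each((index, element) => {
      const title = $(element).children(".result-title").text();
      console.log(title);
    });
  } catch (err) {
    console.error(err);
  }
}

scrapeCraigslist();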

UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'map' of undefined

I'm building a scraper here using Node.js request, request-promise, and cheerio.
My code:
const request = require("request");
const cheerio = require("cheerio");
const rp = require("request-promise");
const url = "https://singapore.craigslist.org/d/automotive-services/search/aos"
const scrapeResults = [];
async function scrapeJobHeader() {
try {
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const resultTitle = $(element).children(".result-title");
title = resultTitle.text();
link = resultTitle.attr("href");
const datePosted = $(element).children("time").attr("datetime");
const scrapResult = {title, link, datePosted};
scrapeResults.push(scrapResult);
return scrapeResults;
});
} catch (err) {
console.error(err);
}
}
async function scrapeDescription(jobWithHeaders) {
return await Promise.all(
jobWithHeaders.map(async job => {
const htmResult = await rp.get(job.url);
const $ = await cheerio.load(htmResult);
$(".print-qrcode-container").remove();
job.description = $("#postingbody").text();
})
);
}
async function scrapeCraigslist() {
const jobWithHeaders = await scrapeJobHeader();
const jobsFullData = await scrapeDescription();
console.log(jobFullData);
}
scrapeCraigslist();
When I run the code I get an error like:
C:\Users\Ahmed-PC\craigslist>node index.js
(node:19808) UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'map' of undefined
at scrapeDescription (C:\Users\Ahmed-PC\craigslist\index.js:42:24)
at scrapeCraigslist (C:\Users\Ahmed-PC\craigslist\index.js:62:32)
How can I fix this error, and what am I doing wrong here?
You're doing await scrapeDescription(); without passing the function an array, but it can't work without one.
When you do that, the jobWithHeaders argument is undefined, and you then try to do undefined.map(), which gives you the error you see.
It looks like maybe you just need to change this:
async function scrapeCraigslist() {
const jobWithHeaders = await scrapeJobHeader();
const jobsFullData = await scrapeDescription();
console.log(jobFullData);
}
to this:
async function scrapeCraigslist() {
const jobWithHeaders = await scrapeJobHeader();
const jobsFullData = await scrapeDescription(jobWithHeaders); // <===
console.log(jobsFullData);
}
Also, there's no reason to do:
return await Promise.all(...)
Change that to:
return Promise.all(...)
Either way, you're returning a promise that resolves to the same value. Basically, there's never any reason inside an async function to do return await somePromise; just return the promise directly without the await. All the await does (if not optimized out by the interpreter) is wait for the promise to resolve, get the value out of it, and then make that value the resolved value of the promise the async function already returned. That gives you an identical result to just returning the promise you already had without the await.
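For example, these two functions behave the same from the caller's point of view; both return a promise that resolves with the fetched HTML:

async function fetchPage(url) {
  return await rp.get(url); // the extra await adds nothing here
}

async function fetchPageSimpler(url) {
  return rp.get(url); // same resolved value, one less step
}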
Change this:
const scrapeResults = [];
async function scrapeJobHeader() {
try {
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const resultTitle = $(element).children(".result-title");
title = resultTitle.text();
link = resultTitle.attr("href");
const datePosted = $(element).children("time").attr("datetime");
const scrapResult = {title, link, datePosted};
scrapeResults.push(scrapResult);
return scrapeResults;
});
} catch (err) {
console.error(err);
}
}
to this:
async function scrapeJobHeader() {
const scrapeResults = [];
const htmResult = await rp.get(url);
const $ = await cheerio.load(htmResult);
$(".result-info").each((index, element) => {
const resultTitle = $(element).children(".result-title");
const title = resultTitle.text();
const link = resultTitle.attr("href");
const datePosted = $(element).children("time").attr("datetime");
const scrapResult = {title, link, datePosted};
scrapeResults.push(scrapResult);
});
return scrapeResults;
}
And, then change this:
scrapeCraigslist();
to this:
scrapeCraigslist().then(results => {
// use the results in here only
console.log(results);
}).catch(err => {
console.log(err);
});
Then, change this:
async function scrapeDescription(jobWithHeaders) {
return await Promise.all(
jobWithHeaders.map(async job => {
const htmResult = await rp.get(job.url);
const $ = await cheerio.load(htmResult);
$(".print-qrcode-container").remove();
job.description = $("#postingbody").text();
})
);
}
to this:
function scrapeDescription(jobWithHeaders) {
return Promise.all(
jobWithHeaders.map(async job => {
const htmResult = await rp.get(job.url);
const $ = await cheerio.load(htmResult);
$(".print-qrcode-container").remove();
job.description = $("#postingbody").text();
return job;
})
);
}
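One more detail worth checking (an observation, not part of the original answer): scrapeJobHeader stores each URL under the link property, while scrapeDescription reads job.url, so one of the two names probably has to change, e.g.:

const htmResult = await rp.get(job.link); // match the `link` property set in scrapeJobHeader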
