I have an array of URLs and I want to fetch every element of that array in order, but when I run the code the results come back in the wrong order, and some elements are missing.
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

var chocolateList = [];
urlArray = [...] // 95 elements

urlArray.forEach((url, index) => {
    fetch(url).then(res => res.text())
        .then(async (success) => {
            const dom = new JSDOM(success); // convert response into DOM
            dom.window.document.querySelectorAll('h1')
                .forEach((htmlH1, i) => { // get all h1 html tags
                    if (htmlH1.includes('chocolate')) {
                        chocolateList.push({ name: htmlH1 });
                    }
                });
            console.log(chocolateList)
        })
});
It outputs an array of approximately 20 elements instead of 95.
What am I doing wrong?
Try the code below; it works. You made a few mistakes in your version, which I've explained in the code comments.
const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

var chocolateList = [];
urlArray = ['https://www.example.com', 'https://www.example2.com'];

urlArray.forEach((url) => {
    fetch(url).then(res => res.text())
        .then(async (success) => {
            const dom = new JSDOM(success);
            // h1 tag NodeList
            let h1Arr = dom.window.document.querySelectorAll('h1');
            // check that the page actually contains h1 tags
            if (h1Arr.length) {
                h1Arr.forEach((htmlH1, i) => {
                    // htmlH1 is a DOM element, not a string, so you can't call
                    // includes() on it directly; read its textContent instead
                    let h1Text = htmlH1.textContent;
                    // The content you receive contains a lot of newline and carriage-return
                    // characters, so sanitize the data before proceeding, and convert it
                    // to lower case, because includes() is a case-sensitive search
                    h1Text = h1Text.replace(/[\r\n]/g, '').trim().toLowerCase();
                    if (h1Text.includes('chocolate')) {
                        chocolateList.push({ name: htmlH1.textContent.trim() });
                    }
                });
            } else {
                console.log('No h1 element present on DOM');
            }
            console.log(chocolateList);
        })
});
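One more thing: even with the fix above, forEach fires all the fetches at once, so results are pushed in whatever order the responses happen to arrive. If you need the results in the same order as urlArray, one option is Promise.all, which resolves to an array in input order regardless of which request finishes first — a minimal sketch:

const fetch = require('node-fetch');
const jsdom = require("jsdom");
const { JSDOM } = jsdom;

async function getChocolateList(urlArray) {
    // Promise.all keeps results in the same order as its input array
    const pages = await Promise.all(
        urlArray.map(url => fetch(url).then(res => res.text()))
    );
    const chocolateList = [];
    for (const page of pages) {
        const dom = new JSDOM(page);
        dom.window.document.querySelectorAll('h1').forEach(htmlH1 => {
            const h1Text = htmlH1.textContent.trim().toLowerCase();
            if (h1Text.includes('chocolate')) {
                chocolateList.push({ name: htmlH1.textContent.trim() });
            }
        });
    }
    return chocolateList;
}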
As an exercise, I'm creating a simple API that allows users to provide a search term to retrieve links to appropriate news articles across a collection of resources. The relevant function and the route handler that uses it are as follows:
function GetArticles(searchTerm) {
    const articles = [];
    // Loop through each resource
    resources.forEach(async resource => {
        const result = await axios.get(resource.address);
        const html = result.data;
        // Use Cheerio: load the html document and create a Cheerio selector/API
        const $ = cheerio.load(html);
        // Filter html to retrieve appropriate links
        $(`a:contains(${searchTerm})`, html).each((i, el) => {
            const title = $(el).text();
            let url = $(el).attr('href');
            articles.push({
                title: title,
                url: url,
                source: resource.name
            });
        })
    })
    return articles; // Empty array is returned
}
And the route handler that uses the function:
app.get('/news/:searchTerm', async (req, res) => {
    const searchTerm = req.params.searchTerm;
    const articles = await GetArticles(searchTerm);
    res.json(articles);
})
The problem I'm getting is that the returned "articles" array is empty. However, if I'm not "looping over each resource" as commented in the beginning of GetArticles, but instead perform the main logic on just a single "resource", "articles" is returned with the requested data and is not empty. In other words, if the function is the following:
async function GetArticles(searchTerm) {
    const articles = [];
    const result = await axios.get(resources[0].address);
    const html = result.data;
    const $ = cheerio.load(html);
    $(`a:contains(${searchTerm})`, html).each((i, el) => {
        const title = $(el).text();
        let url = $(el).attr('href');
        articles.push({
            title: title,
            url: url,
            source: resources[0].name
        });
    })
    return articles; // Populated array
}
Then "articles" is not empty, as intended.
I'm sure this has to do with how I'm dealing with the asynchronous nature of the code. I've tried refreshing my knowledge of asynchronous programming in JS but I still can't quite fix the function. Clearly, the "articles" array is being returned before it's populated, but how?
Could someone please help explain why my GetArticles function works with a single "resource" but not when looping over an array of "resources"?
Try this
function GetArticles(searchTerm) {
    // fire off all requests at once; Promise.all waits for every response
    // and keeps them in the same order as the resources array
    return Promise.all(resources.map(resource => axios.get(resource.address)))
        .then(responses => responses.flatMap((result, idx) => {
            const html = result.data;
            // Use Cheerio: load the html document and create a Cheerio selector/API
            const $ = cheerio.load(html);
            let articles = []
            // Filter html to retrieve appropriate links
            $(`a:contains(${searchTerm})`, html).each((i, el) => {
                const title = $(el).text();
                let url = $(el).attr('href');
                articles.push({
                    title: title,
                    url: url,
                    // responses line up with resources by index
                    source: resources[idx].name
                });
            })
            return articles;
        }))
}
The problem in your implementation was here
resources.forEach(async resource...
You defined the callback as async, but forEach doesn't wait for the async callbacks it launches to complete.
So GetArticles returns the array before any of them finish, and it will always be empty.
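If you'd rather keep the requests sequential (one resource at a time), an async function with a for...of loop also works, because await actually pauses a for...of loop, unlike forEach — a sketch of that alternative:

async function GetArticles(searchTerm) {
    const articles = [];
    // for...of lets await suspend the loop between requests
    for (const resource of resources) {
        const result = await axios.get(resource.address);
        const $ = cheerio.load(result.data);
        $(`a:contains(${searchTerm})`).each((i, el) => {
            articles.push({
                title: $(el).text(),
                url: $(el).attr('href'),
                source: resource.name
            });
        });
    }
    return articles;
}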
I'm doing web scraping and writing the data to an HTML file.
On the line const content = await page.$eval('.eApVPN', e => e.innerHTML); I'm fetching the inner HTML of a div. This div contains multiple p tags, and inside those p tags there are multiple hyperlink (a) tags.
I want to remove those link tags, but I'm unable to do so.
const fs = require('fs').promises;
const helps = require('./_helpers');

const OUTDIR = './results/dataset/'

// create the output directory if it doesn't exist yet
fs.stat(OUTDIR).catch(async (err) => {
    if (err.code === 'ENOENT') {
        await fs.mkdir(OUTDIR);
    }
});

const scraperObject = {
    async scraper(browser) {
        const dataSet = await helps.readCSV('./results/dataset.csv');
        console.log("dataSet is: ", dataSet);
        var cookies = null
        let page = await browser.newPage();
        for (let i = 0; i < dataSet.length; i++) {
            let url = dataSet[i].coinPage
            const filename = dataSet[i].symbol;
            try {
                console.log(`Navigating to ${url}...`);
                await page.goto(url);
                if (cookies == null) {
                    cookies = await page.cookies();
                    await fs.writeFile('./storage/cookies', JSON.stringify(cookies, null, 2));
                }
                await helps.autoScroll(page);
                await page.waitForSelector('.eApVPN');
                const content = await page.$eval('.eApVPN', e => e.innerHTML);
                await fs.writeFile(`${OUTDIR}${filename}.html`, content);
                console.log("Written to HTML successfully!");
            } catch (err) {
                console.log(err, '------->', dataSet[i].symbol);
            }
        }
        await page.close();
    }
}

module.exports = scraperObject;
Unfortunately, Puppeteer doesn't have native functionality to remove nodes. However, you can use the .evaluate method to run any JavaScript against the current document. For example, a script that removes your nodes would look something like this:
await page.evaluate((sel) => {
    var elements = document.querySelectorAll(sel);
    for (var i = 0; i < elements.length; i++) {
        elements[i].remove()
    }
}, ".eApVPN a")
The above code will remove any <a> nodes inside a node with the eApVPN class. Note the descendant selector (.eApVPN a) rather than a direct-child selector (.eApVPN>a), since your links sit inside <p> tags. Then you can extract the data with your $eval selector.
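In your scraper, the call would go right after waitForSelector and before the $eval, so the links are already gone when you grab the innerHTML — roughly:

await page.waitForSelector('.eApVPN');
// strip the link tags before reading the markup
await page.evaluate((sel) => {
    document.querySelectorAll(sel).forEach(el => el.remove());
}, '.eApVPN a');
const content = await page.$eval('.eApVPN', e => e.innerHTML);

If you only want to drop the href attributes but keep the link text visible, replace el.remove() with el.removeAttribute('href').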
For very large JSON/GeoJSON files, I'd like to create a primitive key/value store that keeps track of the starting positions and lengths of each JSON record in the file. This way, I could look up individual records at a later stage without reading the whole file into memory (Using the fd.read API). Somewhat similar to a super simple database, but read-only and without the extra overhead.
The issue I'm facing is that I don't know how I could determine the starting position and byte length of each JSON record / GeoJSON feature in the original file.
Here's some pseudo-code showcasing what I'm trying to achieve. Note that the geojsonStream.parse callback doesn't actually receive the startByte and length arguments in reality, though.
Thanks for your help, also happy about any feedback outlining why this might be a bad idea :)
import geojsonStream from 'geojson-stream'
import { open } from 'fs/promises'
import { Buffer } from 'buffer'

function getFeaturePositionsInFile(fd) {
    return new Promise((resolve, reject) => {
        const featurePositionsInFile = []
        const stream = fd
            .createReadStream()
            .pipe(geojsonStream.parse((building, index, startByte, length) => {
                // The startByte and length callback arguments are not real, unfortunately :(
                featurePositionsInFile.push({
                    index,
                    startByte,
                    length
                })
            }))
        stream.on('end', () => resolve(featurePositionsInFile))
        stream.on('error', reject)
    })
}

async function readSingleFeatureFromFile(fd, startPosition, length) {
    const buff = Buffer.alloc(length)
    const offset = 0
    const { buffer } = await fd.read(buff, offset, length, startPosition)
    return JSON.parse(buffer.toString())
}

const fd = await open('buildings.geojson')
const featurePositionsInFile = await getFeaturePositionsInFile(fd)

const featureIndexToRead = 0
const { startByte, length } = featurePositionsInFile[featureIndexToRead]
const singleFeature = await readSingleFeatureFromFile(fd, startByte, length)
Alright, since I couldn't find a suitable package for my needs, I created a simple (naïve) solution using RegExp to extract single GeoJSON features.
It works given that:
- the GeoJSON has a properties object, and that object is the last key in the parent Feature object
- the GeoJSON (properties) consists solely of ASCII characters
For GeoJSON files containing non-ASCII characters, the byte counting is off. I tried but couldn't really find out what exactly I'm doing wrong, so any help is appreciated!
For a more general solution, I guess one would need to implement the byte counting logic in an existing library such as stream-json
import { open } from 'fs/promises'
import { Buffer } from 'buffer'

const HIGHWATERMARK = 64 * 1024 / 8

function getFeaturePositionsInFile(fd) {
    return new Promise((resolve, reject) => {
        const featurePositionsInFile = []
        const stream = fd.createReadStream({ highWaterMark: HIGHWATERMARK, autoClose: false });
        // This RegExp will only work with standard GeoJSON without any foreign members:
        // https://datatracker.ietf.org/doc/html/rfc7946#section-6.1
        // The properties object has to be present, and has to be the last key in the GeoJSON object
        const jsonExtractor = /\{[\n\r\s]*?"type":[\n\r\s]*?"Feature"[\S\s]*?\}(?:[\n\r\s]*\})+/g
        let string = ''
        let endPos = 0
        stream.on('data', (d) => {
            const section = d.toString()
            const sectionLength = (new TextEncoder().encode(section)).length
            string += section
            endPos += sectionLength
            let match
            let latestEndPositionInString = 0
            while ((match = jsonExtractor.exec(string)) != null) {
                const startPositionInString = match.index
                const featureString = match[0]
                const endPositionInString = startPositionInString + featureString.length
                const curStringLength = (new TextEncoder().encode(string)).length
                // calculate starting position in file
                const startPosition = endPos - curStringLength + startPositionInString
                // calculate number of bytes in feature
                const byteLength = (new TextEncoder().encode(featureString)).length
                // store info for later in our lookup array
                featurePositionsInFile.push({
                    startPosition,
                    byteLength
                })
                if (endPositionInString > latestEndPositionInString) {
                    latestEndPositionInString = endPositionInString
                }
            }
            // remove matched features from the buffered string to free memory
            string = string.substring(latestEndPositionInString)
        })
        stream.on('end', () => resolve(featurePositionsInFile))
        stream.on('error', reject)
    })
}

async function readSingleFeatureFromFile(fd, startPosition, length) {
    const buff = Buffer.alloc(length)
    const offset = 0
    const { buffer } = await fd.read(buff, offset, length, startPosition)
    return JSON.parse(buffer.toString())
}

async function getFeature(featureIndexToRead, featurePositionsInFile) {
    const { startPosition, byteLength } = featurePositionsInFile[featureIndexToRead]
    return readSingleFeatureFromFile(fd, startPosition, byteLength)
}

// source: https://raw.githubusercontent.com/node-geojson/geojson-stream/master/test/data/featurecollection.geojson
const path = 'featurecollection.geojson'
// -> has 3 features
const fd = await open(path, 'r');
const featurePositionsInFile = await getFeaturePositionsInFile(fd)

// get the nth (e.g. 3rd) feature in the file
const thirdFeature = await getFeature(2, featurePositionsInFile)
console.log(thirdFeature)

// done! make sure to close the file handle
fd.close()
https://gist.github.com/chrispahm/c226cca151b25147869288600151a5f8
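Regarding the non-ASCII issue: match.index is a character index into string, while endPos and curStringLength are byte counts, so the startPosition calculation mixes units whenever a multi-byte character precedes the match. A possible fix (a sketch, not tested against every input) is to measure the byte length of the prefix before the match instead:

// inside the while loop, replacing the startPosition calculation:
// byte length of everything in `string` before the match
const prefixByteLength = (new TextEncoder().encode(
    string.substring(0, startPositionInString)
)).length
const startPosition = endPos - curStringLength + prefixByteLength

Note that d.toString() can also corrupt a multi-byte character that straddles two chunks; passing an encoding to the read stream (fd.createReadStream({ encoding: 'utf8', ... })) makes Node buffer the split bytes and decode them correctly.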
I'm building a simple NodeJS web scraper, and I want to re-run the function like a 'for loop' until pageNum = totalNumberOfPages. I'm having a brain fart and can't re-run the function from inside itself, since it returns an array fragment and kills itself. Could someone help me overcome this obstacle? I'm pretty sure it's very simple.
I looked at this and this but didn't figure it out...
const cheerio = require("cheerio");
const axios = require("axios");

let pageNum = 0;
let siteUrl = "https://whatever.com?&page=" + pageNum + "&viewAll=true";

let productArray = [];
let vendor = [];
let productTitle = [];
let plantType = [];
let thcRange = [];
let cbdRange = [];
let price = [];
let totalNumberOfPages = undefined;

// called by getResults()
const fetchData = async () => {
    const result = await axios.get(siteUrl);
    return cheerio.load(result.data);
};

// this function is called from index.js
const getResults = async () => {
    // >>>>>>>>>>>>>>>>>> HOW DO I RERUN FROM HERE <<<<<<<<<<<<<<<<<<<<<<<<<<<
    const $ = await fetchData();
    // first check how many total pages there are
    totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
    // use fetched data to grab elements (and their text) and push into arrays defined above
    $('.product-tile__vendor').each((index, element) => {
        vendor.push($(element).text());
    });
    $('.product-tile__title').each((index, element) => {
        productTitle.push($(element).text());
    });
    $('.product-tile__plant-type').each((index, element) => {
        plantType.push($(element).text());
    });
    $('.product-tile__properties li:nth-child(2) p').each((index, element) => {
        thcRange.push($(element).text());
    });
    $('.product-tile__properties li:nth-child(3) p').each((index, element) => {
        cbdRange.push($(element).text());
    });
    $('.product-tile__price').each((index, element) => {
        price.push($(element).text());
    });
    // increment page number to get more products if the page count is less than total number of pages
    if (pageNum < totalNumberOfPages) {
        pageNum++;
    };
    // Convert to an array so that we can sort the results.
    productArray.push({
        vendors: [...vendor],
        productTitle: [...productTitle],
        plantType: [...plantType],
        thcRange: [...thcRange],
        cbdRange: [...cbdRange],
        price: [...price],
        pageNum
    });
    // >>>>>>>>>>>>>>>>>> UNTIL HERE I THINK <<<<<<<<<<<<<<<<<<<<<<<<<<<
    return productArray;
};

module.exports = getResults;
You can use recursion in your code, which means the function will call itself. So what you can do is:
const getResults = async () => {
    const $ = await fetchData();
    // first check how many total pages there are
    totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
    // use fetched data to grab elements (and their text) and push into arrays defined above
    $('.product-tile__vendor').each((index, element) => {
        vendor.push($(element).text());
    });
    $('.product-tile__title').each((index, element) => {
        productTitle.push($(element).text());
    });
    $('.product-tile__plant-type').each((index, element) => {
        plantType.push($(element).text());
    });
    $('.product-tile__properties li:nth-child(2) p').each((index, element) => {
        thcRange.push($(element).text());
    });
    $('.product-tile__properties li:nth-child(3) p').each((index, element) => {
        cbdRange.push($(element).text());
    });
    $('.product-tile__price').each((index, element) => {
        price.push($(element).text());
    });
    // increment the page number and rebuild the url if there are more pages;
    // without updating siteUrl, every request would fetch page 0 again
    if (pageNum < totalNumberOfPages) {
        pageNum++;
        siteUrl = "https://whatever.com?&page=" + pageNum + "&viewAll=true";
    };
    // Convert to an array so that we can sort the results.
    productArray.push({
        vendors: [...vendor],
        productTitle: [...productTitle],
        plantType: [...plantType],
        thcRange: [...thcRange],
        cbdRange: [...cbdRange],
        price: [...price],
        pageNum
    });
    // recurse until the last page has been scraped; return the recursive call
    // so the caller's await resolves only once every page is done
    if (pageNum < totalNumberOfPages) {
        return getResults();
    }
    return productArray;
};
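Two details matter here compared with your version: siteUrl has to be rebuilt after incrementing pageNum (otherwise fetchData keeps requesting page 0), and the recursive call has to be returned (or awaited) so the promise only resolves once the last page is scraped. If you'd rather avoid recursion, the same flow works as a plain loop — a sketch with the scraping body elided:

const getResults = async () => {
    do {
        siteUrl = "https://whatever.com?&page=" + pageNum + "&viewAll=true";
        const $ = await fetchData();
        totalNumberOfPages = parseInt($('.pagination li:nth-last-child(2)').text());
        // ...same element-grabbing and productArray.push logic as above...
        pageNum++;
    } while (pageNum < totalNumberOfPages);
    return productArray;
};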
I'm having trouble with error handling in a function in my bot for Discord. What I've got right now is a command that scrapes information from a website; I want to make it so that if there is an error (404), the user gets some feedback. How would I go about doing this? Right now I have something, but I'm not sure what I'm doing wrong. Here is a snippet of the code:
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');

if (message.content.startsWith(prefix + 'latest')) {
    //website url variables
    const website_domain = "https://hypebeast.com/";
    let website_path = args[0];
    let website_url = website_domain + website_path;
    //extra arguments variable
    let extra_arg = args.slice(1).join(" ");
    if (extra_arg.length > 0) {
        message.reply('too many arguments! Please refer to `h.help` for correct usage.');
    } else {
        //opening url and scraping elements
        function scrapData(website_url) {
            return rp(website_url)
                .then(body => {
                    let items = [],
                        $ = cheerio.load(body).catch(errors.StatusCodeError, function (reason) {
                            console.log(reason);
                        });
                    //web scraping here
                    $('.post-box').each(function () {
                        let title = $(this).find($('.title h2 span')).first().text(),
                            caption = $(this).find($('.post-box-excerpt p')).first().text(),
                            article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
                            thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
                        //adding title, caption, etc to list
                        items.push({ title, caption, article_url, thumbnail_long });
                        //check items in console
                        console.log(items);
                    })
                    return items;
                })
        }
I've modified your code a little; try the code below.
//modules used
const rp = require('request-promise-native');
const errors = require('request-promise/errors');
const cheerio = require('cheerio');

if (message.content.startsWith(prefix + 'latest')) {
    //website url variables
    const website_domain = "https://hypebeast.com/";
    let website_path = args[0];
    let website_url = website_domain + website_path;
    //extra arguments variable
    let extra_arg = args.slice(1).join(" ");
    if (extra_arg.length > 0) {
        message.reply('too many arguments! Please refer to `h.help` for correct usage.');
    } else {
        //declare items before using it below
        let items = [];
        var options = {
            uri: website_url,
            transform: function (body) {
                return cheerio.load(body);
            }
        };
        rp(options)
            .then(function ($) {
                // Process html like you would with jQuery...
                $('.post-box').each(function () {
                    let title = $(this).find($('.title h2 span')).first().text(),
                        caption = $(this).find($('.post-box-excerpt p')).first().text(),
                        article_url = $(this).find($('.col-hb-post-image a')).first().attr('href'),
                        thumbnail_long = $(this).find($('.thumbnail img')).first().attr('src');
                    //adding title, caption, etc to list
                    items.push({ title, caption, article_url, thumbnail_long });
                    //check items in console
                    console.log(items);
                });
            })
            .catch(errors.StatusCodeError, function (reason) {
                //a non-2xx response (e.g. 404) was received, give the user feedback
                message.reply(`couldn't fetch that page (status ${reason.statusCode}).`);
            })
            .catch(function (err) {
                console.log(err);
            });
    }
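A note on the design: request-promise is backed by Bluebird, so .catch() accepts an error-class filter as its first argument. Catching errors.StatusCodeError separately is what lets the bot distinguish an HTTP error like a 404 (worth reporting to the user) from an unexpected failure (worth logging); reason.statusCode carries the HTTP status, and reason.response holds the full response if you need more detail. Also note that the error handling must hang off the rp() promise chain, not off cheerio.load(), which returns a plain function rather than a promise — that's what was wrong in your original snippet.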