Nodejs http get and return statusCode - node.js

I want to run through a list of urls and check if they are valid if no 404 status code is returned. My code is as below. But isValid key in my data object doesn't seem to be populated accordingly.
var https = require('https');
// Question code (kept as posted): meant to report whether `url` answers with
// a status other than 404.
// NOTE(review): this awaits the return value of https.get as if it were a
// promise of the response; Node's https.get is callback/event based, so
// `statusCode` is probably not available on the awaited value — confirm.
const check = async url => {
let result = false;
try {
const request = await https.get(url);
// Anything other than an exact 404 on `request.statusCode` counts as valid.
result = request.statusCode !== 404;
} catch (err) {
// Errors are only logged; the initial `false` is returned in that case.
console.log(err);
}
return result;
};
// and in another function, I am doing this
// Intended to build `data` with one { url, isValid } entry per URL.
// NOTE(review): the forEach callback below is not declared `async`, so the
// `await` inside it is not valid, and it calls `isValidUrl` although the
// function defined above is named `check`.
const data = [];
const urls = [/* urls here */];
urls.forEach(url => {
data.push({
url,
isValid: await isValidUrl(url)
})
});

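You are calling a different function (`isValidUrl` instead of `check`), and `await` cannot be used inside the non-async `forEach` callback. Use `Promise.all` to collect the data.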
var https = require('https');
/**
 * Checks whether `url` responds with anything other than HTTP 404.
 *
 * `https.get` is event based and returns the request object, not a promise,
 * so the response is adapted to a promise here before it is awaited.
 *
 * @param {string|URL} url - Address to request over HTTPS.
 * @returns {Promise<{isValid: boolean, url: (string|URL)}>} Never rejects:
 *   `isValid` is false for a 404 and for any request error (which is logged).
 */
const check = async (url) => {
  let isValid = false;
  try {
    const response = await new Promise((resolve, reject) => {
      // A synchronous throw from https.get (e.g. a malformed URL) rejects too.
      https.get(url, resolve).on('error', reject);
    });
    // Only the status line matters; drain the body so the socket is released.
    response.resume();
    isValid = response.statusCode !== 404;
  } catch (err) {
    console.log(err);
  }
  return {
    isValid,
    url
  };
};
// and in another function, I am doing this
const data = [];
const urls = [ /* urls here */ ];
// Promise.all returns a promise, so `result` is the pending promise for the
// list of { isValid, url } records; `data` is filled once every check is done.
const result = Promise.all(urls.map((url) => check(url))).then((checked) => {
  data.push(...checked);
  return checked;
});

Related

Async function to scrape subreddits using Cheerio returns undefined

The script by itself works great (entering the url manually, writing a json file using the fs module, node script_name.js) but within a Express get request it returns undefined.
So I've built a simple frontend to let the user enter the subreddit name to be scraped.
And here's where the problem is:
Express controller
const run = require("../run");
requestPosts: async (req, res) => {
try {
const { subreddit } = req.body;
const response = await run(subreddit);
//console.log(response);
res.json(response);
} catch (error) {
console.error(error);
}
},
Cheerio functions
const axios = require("axios");
const { load } = require("cheerio");
let posts = [];
/**
 * Fetches a post page and reads the `href` of its `a.post-link` anchor.
 *
 * @param {string} postLink - URL of the post page to download.
 * @returns {Promise<*>} Whatever `.attr("href")` yields for that selection.
 */
async function getImage(postLink) {
  const response = await axios(postLink);
  const page = load(response.data);
  const anchor = page("a.post-link");
  return anchor.attr("href");
}
// Question code (kept as posted): scrapes one listing page into the
// module-level `posts` array, then follows the "next" link recursively.
async function run(url) {
try {
console.log(url);
const { data } = await axios(url);
const $ = load(data);
// The promises produced by this async callback are not awaited, so execution
// continues to the pagination code below while getImage() calls are pending.
$(".thing.linkflair.link").map(async (i, e) => {
const title = $(e)
.find(".entry.unvoted .top-matter .title .title")
.text();
const user = $(e)
.find(".entry.unvoted .top-matter .tagline .author")
.text();
const profileLink = `https://old.reddit.com/user/${user}`;
const postLink = `https://old.reddit.com/${$(e).find("a").attr("href")}`;
// const thumbail = $(e).find("a img").attr("src");
const image = await getImage(postLink);
posts.push({
id: i + 1,
title,
postLink,
image,
user: { user, profileLink },
});
});
const nextPage = $(".next-button a").attr("href");
if (nextPage) {
// The recursive result is awaited but not returned, so this branch
// makes the function resolve to undefined.
await run(nextPage);
} else {
return posts;
}
} catch (error) {
// Errors are logged only; the function then resolves to undefined.
console.error(error);
}
}
module.exports = run;
I've tried working with Promise((resolve, reject) => {}).
I think it's returning undefined because the code may not be running in the order I expect.
(idk if it makes sense, i've just started programming)
.map() is not promise-aware and does not wait for your promises to finish. So, $(".thing.linkflair.link").map() finishes long before any of the asynchronous functions inside its callback do. Thus you try to return posts BEFORE it has been populated.
Passing an async callback to .map() will return an array of promises. You can use Promise.all() on those promises to know when they are done, and once you're doing that, you may as well just return each post object rather than using a higher-level scoped/shared object, thus making the code more self-contained.
I would suggest this code:
/**
 * Scrapes every post on the listing page at `url`, then follows the "next"
 * link recursively and appends the posts of the following pages.
 *
 * @param {string} url - Listing page to scrape.
 * @returns {Promise<Array<object>|undefined>} Posts collected from this page
 *   onward, or undefined when this page could not be fetched or parsed
 *   (the error is logged).
 */
async function run(url) {
  try {
    console.log(url);
    const { data } = await axios(url);
    const $ = load(data);
    // .map() on a cheerio selection yields another cheerio object; .get()
    // unwraps it into a plain array of promises for Promise.all.
    const pending = $(".thing.linkflair.link")
      .map(async (i, e) => {
        const title = $(e)
          .find(".entry.unvoted .top-matter .title .title")
          .text();
        const user = $(e)
          .find(".entry.unvoted .top-matter .tagline .author")
          .text();
        const profileLink = `https://old.reddit.com/user/${user}`;
        const postLink = `https://old.reddit.com/${$(e).find("a").attr("href")}`;
        const image = await getImage(postLink);
        // return a post object
        return {
          id: i + 1,
          title,
          postLink,
          image,
          user: { user, profileLink },
        };
      })
      .get();
    const posts = await Promise.all(pending);
    const nextPage = $(".next-button a").attr("href");
    if (nextPage) {
      const newPosts = await run(nextPage);
      // A failed page resolves to undefined; spreading that would throw and
      // discard the posts already collected, so only append real arrays.
      if (Array.isArray(newPosts)) {
        posts.push(...newPosts);
      }
    }
    return posts;
  } catch (error) {
    console.error(error);
  }
}

Cannot get the result of an async function in controller

This is my controller:
const rssService = require('../services/rss.service');
// Thin wrapper: delegates to the RSS service for the fixed feed 'someurl'
// and resolves with whatever the service produces.
async function parser() {
  return rssService.rssParser('someurl');
}
// Question code (kept as posted): handler that is meant to send the parsed
// feed back through `res`.
const parse = async function (req, res) {
const p = new Promise((resolve, reject) => {
// parser() is declared `async`, so `t` is a promise here, not the feed.
const t = parser();
// Resolves only when `t` is strictly undefined; every other value
// (including a promise) takes the reject branch.
if (t === undefined) {
resolve(t);
} else {
// eslint-disable-next-line prefer-promise-reject-errors
reject('something bad happened');
}
});
// A rejection is only logged; nothing is sent on `res` in that case.
p.then((result) => res.send(result)).catch((message) => console.log(`ERROR ${message}`));
};
module.exports = {
parse,
};
In the function parser() above, I am trying to call my rss.service.js file, which holds the logic. This file is an RSS parser which tries to parse the feed and do some calculations (which need promises and async) and then returns the JSON object.
Here is how my rss.service look :
/**
 * Parses the RSS feed at `url`, downloads the file of every item and hashes it.
 *
 * @param {string} url - Address of the feed to parse.
 * @returns {Promise<string>} JSON text of an array of { title, hash, url }
 *   records, or the string 'error' when any step fails (the error is logged).
 */
const rssParser = async function parseRssFeed(url) {
  const parser = new Parser();
  const appRoot = process.env.PWD;
  const downloadDir = `${appRoot}/downloads/`;
  if (!fs.existsSync(downloadDir)) {
    fs.mkdirSync(downloadDir);
  }
  try {
    // Use the caller's feed address instead of a hard-coded one.
    const feed = await parser.parseURL(url);
    const processedFeedItems = await Promise.all(feed.items.map(async (currentItem) => {
      const { fileUrl, downloadPath } = await downloadFile(currentItem);
      // NOTE(review): calculateHash is defined elsewhere; awaiting is harmless
      // if it is synchronous and required if it returns a promise — confirm.
      const hashValue = await calculateHash(downloadPath);
      return {
        title: currentItem.title,
        hash: hashValue,
        // The destructured name is `fileUrl`; `mp3FileUrl` was never defined.
        url: fileUrl,
      };
    }));
    return JSON.stringify(processedFeedItems);
  } catch (error) {
    console.error(error);
    return 'error';
  }
};
when I debug my code I can verify that Json object has been created with correct data, but the result does not return to the callee(controller).
I'll go in a little deeper since you mentioned you're new:
const rssService = require('../services/rss.service');
// This is an async function (always returns a promise)
// Delegates to the RSS service for the fixed feed 'someurl'; callers receive
// a promise for whatever the service resolves with.
async function parser() {
  const pendingFeed = rssService.rssParser('someurl');
  return pendingFeed;
}
/**
 * Express handler: sends the parsed feed, or forwards any failure to `next`.
 *
 * @param {object} req - Incoming request (unused).
 * @param {object} res - Response; `res.send` receives the parsed feed.
 * @param {Function} next - Called with the error when parsing fails.
 * @returns {Promise<*>} The return value of `res.send` or of `next`.
 */
const parse = async function (req, res, next) {
  // With async/await, use try/catch/throw instead of .then and .catch.
  // It does the same thing, but it is the preferred syntax and is easier to
  // read; mixing await/async with promise chains is generally discouraged.
  try {
    // parser is an async function (so it returns a promise); awaiting it
    // gives the value instead of the promise.
    const result = await parser();
    if (result === undefined) {
      throw new Error('something bad happened');
    }
    return res.send(result);
  } catch (error) {
    console.log(`ERROR ${error.message}`);
    // Hand the error to Express so its error middleware can respond.
    return next(error);
  }
};
module.exports = {
parse,
};
At a quick glance, it seems you forgot to await the promise returned by parser().
...
const p = new Promise(async(resolve, reject) => {
const t = await parser();
...

I have a question about crawling. I want to use the return value outside of the function

I'm crawling with Node.js and I want to use the return value (which is titleList) outside of the function, but it isn't available outside of that function.
Please give me some advice about that.
const axios = require("axios");
const cheerio = require('cheerio');
/**
 * Downloads https://google.com with axios.
 *
 * @returns {Promise<*>} The axios response, or undefined when the request
 *   fails (the error is written to the console).
 */
async function getHTML() {
  let response;
  try {
    response = await axios.get("https://google.com");
  } catch (error) {
    console.error(error);
  }
  return response;
}
// Question code (kept as posted): builds `titleList` from the fetched page.
getHTML().then(html =>
{
let titleList = [];
const $ = cheerio.load(html.data);
// store in bodyList
const bodyList = $("ul.new_quickMenu_list");
bodyList.find("li").each(function(i, elem)
{
titleList[i] = {
title : $(this).find("span").text()
};
});
//console.log(titleList);
// This value becomes the resolution of the promise returned by .then();
// that promise is not assigned or chained here, so nothing outside the
// callback receives `titleList`.
return titleList;
})

Puppeteer wrong result with evaluate() & exposeFunction()

I ran the following and it appears to gather a large number of links, however on actual inspection of the site with collectLinks1 I get all valid links, but with collectLinks2 I got 59 iterations of http://pieroxy.net/blog/2014/11/18/[
I'm new to Puppeteer and I can't find out why with collectLinks2 I don't get the links.
const { parse, resolve } = require('url');
const trim = require('lodash/trim');
const startsWith = require('lodash/startsWith');
const includes = require('lodash/includes');
// https://github.com/GoogleChrome/puppeteer
const puppeteer = require('puppeteer');
// https://github.com/gwuhaolin/chrome-finder
const findChrome = require('chrome-finder');
/**
 * Normalises a link found on a page into an absolute http(s) URL without
 * its fragment.
 *
 * @param {string} url - Raw link value; surrounding whitespace is trimmed.
 * @param {string} baseUrl - URL used to resolve links that carry no protocol.
 * @returns {string|null} The absolute URL up to (not including) the first
 *   '#', or null for empty links, fragment-only links and links whose
 *   protocol is neither http: nor https:.
 */
function resolveUrl(url, baseUrl) {
  const cleaned = trim(url);
  if (!cleaned || startsWith(cleaned, '#')) {
    return null;
  }

  const withoutFragment = (value) => value.split('#')[0];
  const { protocol } = parse(cleaned);

  if (!protocol) {
    // No protocol: resolve against the page the link was found on.
    return withoutFragment(resolve(baseUrl, cleaned));
  }
  return includes(['http:', 'https:'], protocol) ? withoutFragment(cleaned) : null;
}
/**
 * Collects the links of a page by reading every `a[href]` in one `$$eval`
 * call and normalising the values with resolveUrl.
 *
 * @param {object} htmlPage - Page object exposing `url()` and `$$eval()`.
 * @returns {Promise<string[]>} Normalised links, in document order; values
 *   for which resolveUrl returns a falsy result are left out.
 */
async function collectLinks1(htmlPage) {
  const baseUrl = htmlPage.url();
  const hrefs = await htmlPage.$$eval('a[href]', anchors => anchors.map(anchor => anchor.href));
  return hrefs
    .map(href => resolveUrl(href, baseUrl))
    .filter(Boolean);
}
// Question code (kept as posted): collects links by calling back from the
// page into `pushToLinks`, which appends to the local `links` array.
async function collectLinks2(htmlPage) {
const baseUrl = htmlPage.url();
const links = [];
// Registers `pushToLinks` on the page; each call normalises one link and
// stores it when resolveUrl returns a truthy value.
await htmlPage.exposeFunction('pushToLinks', link => {
const _link = resolveUrl(link, baseUrl);
if (_link) links.push(_link);
});
await htmlPage.evaluate(() => {
function findLinks(document) {
document.querySelectorAll('a[href]')
.forEach(link => {
// The return value of pushToLinks is not awaited here.
window.pushToLinks(link.href);
});
}
findLinks(window.document);
});
return links;
}
/**
 * Opens `url` in a Chrome instance and prints the links found by both
 * collectors.
 *
 * @param {string} url - Page to crawl.
 * @returns {Promise<void>} Never rejects; every error is logged.
 */
const crawl = async url => {
  let browser;
  try {
    console.log(`Crawling ${url}`);
    browser = await puppeteer.launch({
      headless: false,
      executablePath: findChrome(),
    });
    const page = await browser.newPage();
    await page.goto(url);
    // OK
    const links1 = await collectLinks1(page);
    links1.forEach(link => { console.log(link); });
    // KO
    const links2 = await collectLinks2(page);
    links2.forEach(link => { console.log(link); });
  } catch (err) {
    console.log(err);
  } finally {
    // Close the browser on failure too, so a failed crawl does not leave a
    // Chrome process running.
    if (browser) {
      try {
        await browser.close();
      } catch (closeErr) {
        console.log(closeErr);
      }
    }
  }
};
crawl('http://pieroxy.net/blog/2014/11/18/user_agent_detection_in_java.html');
You need to await the function defined via page.exposeFunction as it returns a Promise. As you are only calling the function but not awaiting its result, your page.evaluate call will resolve before your script finished executing.
Solution
Instead of the forEach, you should use a loop to iterate over all the items and communicate them to the page one after another.
/**
 * Collects the links of a page by calling back into `pushToLinks` once per
 * anchor, waiting for each call to finish before `evaluate` resolves.
 *
 * @param {object} htmlPage - Page object exposing `url()`,
 *   `exposeFunction()` and `evaluate()`.
 * @returns {Promise<string[]>} Normalised links, in document order.
 */
async function collectLinks2(htmlPage) {
  const baseUrl = htmlPage.url();
  const links = [];
  await htmlPage.exposeFunction('pushToLinks', link => {
    const resolved = resolveUrl(link, baseUrl);
    if (resolved) links.push(resolved);
  });
  await htmlPage.evaluate(async () => {
    async function findLinks(document) {
      // Each exposed call is awaited so evaluate() only resolves after
      // every link has been handed over.
      for (const link of document.querySelectorAll('a[href]')) {
        await window.pushToLinks(link.href);
      }
    }
    await findLinks(window.document);
  });
  return links;
}

Wait for something in node.js

I'm trying to make a web scraper, but I can't get my function to wait for the second request to fill the name key on my object. It always returns undefined.
const request = require('request');
const cheerio = require('cheerio');
const base_url = 'https://www.supremenewyork.com';
const shop_url = 'https://www.supremenewyork.com/shop/';
// Question code (kept as posted): lists the items of a shop category as
// { name, url, isSoldout } records and resolves with that list.
function getItems(category) {
var items = [];
return new Promise(function(resolve, reject) {
request.get(shop_url + category, function(err, res, body) {
if(err) {
reject(err);
} else {
var $ = cheerio.load(body);
$('a', '.inner-article').each(function(i, el) {
var url = base_url + $(this).attr('href');
var isSoldout = false;
var name;
// NOTE(review): the two-argument form of .attr() looks like a setter
// rather than a test for the class — confirm the intent.
if($(this).find('div').attr('class', 'sold_out_tag').length === 1)
isSoldout = true;
// `name` is assigned inside this callback, which runs once the second
// request has completed.
request.get(url, function(err, res, html) {
var $ = cheerio.load(html);
name = $('h1', 'div').text();
})
// This push runs right after the request is started, without waiting
// for the callback above to assign `name`.
items.push({name: name, url: url, isSoldout: isSoldout});
})
resolve(items);
}
})
})
}
I expect the name key to be filled, but instead I get undefined.
Use the request-promise package which wraps request in Promise. Then you can use async/await to wait for result like:
const rp = require('request-promise');
const cheerio = require('cheerio');
const base_url = 'https://www.supremenewyork.com';
const shop_url = 'https://www.supremenewyork.com/shop/';
// notice async keyword
/**
 * Lists the items of a shop category together with their product names.
 *
 * The category page supplies each item's URL and sold-out flag; the name is
 * read from the item's own page, so one extra request is made per item.
 *
 * @param {string} category - Category path segment appended to `shop_url`.
 * @returns {Promise<Array<{name: string, url: string, isSoldout: boolean}>>}
 *   Items in page order. Rejects if any request fails.
 */
async function getItems(category) {
  // using await to wait for promise to resolve
  const body = await rp.get(shop_url + category);
  const $ = cheerio.load(body);

  // `await` is not allowed inside a plain .each() callback, so first turn
  // the matched anchors into plain data...
  const entries = $('a', '.inner-article').toArray().map((el) => {
    const anchor = $(el);
    return {
      url: base_url + anchor.attr('href'),
      // Query for the class instead of calling .attr('class', value), which
      // would overwrite the class rather than test for it.
      isSoldout: anchor.find('div.sold_out_tag').length > 0,
    };
  });

  // ...then fetch every item page and wait for all of them.
  return Promise.all(entries.map(async ({ url, isSoldout }) => {
    const html = await rp.get(url);
    // A separate local keeps the listing's `$` from being shadowed.
    const name = cheerio.load(html)('h1', 'div').text();
    return { name, url, isSoldout };
  }));
}
More about async/await at MDN

Resources