Unable to make use of links to fetch different titles - node.js

I've created a script in Node using promises in combination with request and cheerio to parse the links under the Province column of this webpage, then reuse those links to scrape all the URLs under the Office column of each of those pages, and finally use these links to collect the title from each target page, as in Cairos main Post Office in this page.
My current script gets stuck most of the time. However, sometimes it throws this error: UnhandledPromiseRejectionWarning: TypeError: Cannot read property 'parent' of undefined. I've checked each of the functions and found that they all work correctly individually.
Although the script looks a bit long, it is built upon very simple logic: follow each link from its landing page until it reaches the title of its target page.
This is my try so far:
const request = require('request');
const cheerio = require('cheerio');
const link = 'https://www.egyptcodebase.com/en/p/all';
const base_link = 'https://www.egyptcodebase.com/en/';
const items = [];
const nitems = [];
// Requests the landing page (`link`) and resolves with the module-level `items`
// array after appending base_link + href for each `.table tbody tr` row.
// Rejects with the request error, or with anything thrown while collecting rows.
let getLinks = () => {
return new Promise((resolve, reject) => {
request(link, function(error, response, html) {
// NOTE(review): cheerio.load runs before the error check on the next line,
// so `html` is parsed even when `error` is set.
let $ = cheerio.load(html);
if (error) return reject(error);
try {
$('.table tbody tr').each(function() {
// NOTE(review): if a row has no <a>, attr("href") presumably yields undefined
// and the pushed string then ends in "undefined" — confirm against the page.
items.push(base_link + $(this).find("a").attr("href"));
});
resolve(items);
} catch (e) {
reject(e);
}
});
});
};
// Requests every URL in `links` in parallel and appends base_link + href for
// each `.table tbody tr` row to the module-level `nitems` array.
// Every per-link promise resolves with that same `nitems` array, so the
// Promise.all result is an array whose elements are all `nitems` (an array of
// arrays), not a flat list of URLs.
let getData = (links) => {
const promises = links
.map(nurl => new Promise((resolve, reject) => {
request(nurl, function(error, response, html) {
// NOTE(review): parsed before the error check, as in getLinks.
let $ = cheerio.load(html);
if (error) return reject(error);
try {
$('.table tbody tr').each(function() {
nitems.push(base_link + $(this).find("a").attr("href"));
});
resolve(nitems);
} catch (e) {
reject(e);
}
})
}))
return Promise.all(promises)
}
// Requests every entry of `links` in parallel and resolves with the text of
// the first `.home-title > h2` element of each response, in input order.
// Each entry is passed straight to `request` as `nurl`; in the pipeline below
// `links` is getData's result, whose entries are arrays rather than URL strings.
let FetchData = (links) => {
const promises = links
.map(nurl => new Promise((resolve, reject) => {
request(nurl, function(error, response, html) {
// NOTE(review): parsed before the error check, as in getLinks.
let $ = cheerio.load(html);
if (error) return reject(error);
try {
resolve($(".home-title > h2").eq(0).text());
} catch (e) {
reject(e);
}
})
}))
return Promise.all(promises)
}
// Pipeline: province links -> office links -> page titles, logged at the end.
// The inner promises are not returned from the outer callbacks and no .catch
// is attached, so a rejection at any stage is left unhandled.
getLinks().then(resultList => {
getData(resultList).then(resultSet => {
FetchData(resultSet).then(title =>{
console.log(title);
})
})
})
How can I scrape the titles from target pages making use of all the links from landing pages?

It would be much easier to ask the website Owner about the data which you need.
He might understand your request and give it to you for free, instead of scraping his site.
P.S: I was surprised to find a question about how to scrape my own website.
P.S2: If you just need all post office titles I could have given it for you for free :D
P.S3: Your error is probably happening because sometimes the page doesn't have the element which you are trying to parse using cheerio.

So the issue is with a 2D array. If you look carefully at your getData function, you'll see that it returns a 2D array.
map returns an array, and within that map you're resolving another array, nitems.
Here's the working code:
const base_link = 'https://www.egyptcodebase.com/en/';

/**
 * Promise adapter around the callback-style `request` call, shared by all
 * scraping steps so the wrapper is written only once.
 *
 * @param {string} url - Address to GET (redirects are followed).
 * @returns {Promise<string>} Resolves with the response body; rejects with the
 *   error reported by `request`.
 */
const getHtmls = (url) => {
  const options = { uri: url, method: 'GET', followAllRedirects: true };
  return new Promise((resolve, reject) => {
    request(options, (error, response, html) => {
      if (error) {
        reject(error);
        return;
      }
      resolve(html);
    });
  });
};
/**
 * Collects the province links from the landing page.
 *
 * Rows that contain no anchor (or an anchor without an href) are skipped, so
 * the result never contains a bogus "<base_link>undefined" URL.
 *
 * @returns {Promise<string[]>} Absolute URLs (base_link + href). On a request
 *   or parse failure the error message is logged and whatever was collected
 *   so far (possibly an empty array) is returned; this function never rejects.
 */
let getLinks = async () => {
  const link = 'https://www.egyptcodebase.com/en/p/all';
  const items = [];
  try {
    const html = await getHtmls(link);
    const $ = cheerio.load(html);
    $('.table tbody tr').each(function () {
      const href = $(this).find("a").attr("href");
      if (href) {
        items.push(base_link + href);
      }
    });
  } catch (e) {
    // Best effort: report the failure and let the caller continue with what we have.
    console.error(e.message);
  }
  return items;
};
/**
 * Downloads every page in `links` in parallel and collects the office links
 * found in each page's `.table tbody tr` rows into one flat array.
 *
 * Each download carries its own error handler: Promise.all is fail-fast, so
 * without it a single failed request would discard the pages that did load.
 * Rows without an href are skipped.
 *
 * @param {string[]} links - Page URLs to fetch.
 * @returns {Promise<string[]>} Absolute URLs (base_link + href) from every page
 *   that could be fetched and parsed. Failures are logged; never rejects.
 */
let getData = async (links) => {
  const out = [];
  try {
    const pages = await Promise.all(
      links.map((nurl) =>
        getHtmls(nurl).catch((e) => {
          console.error(e.message);
          return null; // marks a page that could not be downloaded
        })
      )
    );
    pages.forEach((html) => {
      if (html == null) {
        return;
      }
      try {
        const $ = cheerio.load(html);
        $('.table tbody tr').each(function () {
          const href = $(this).find("a").attr("href");
          if (href) {
            out.push(base_link + href);
          }
        });
      } catch (e) {
        // One unparsable page must not cost us the others.
        console.error(e.message);
      }
    });
  } catch (e) {
    // e.g. `links` is not an array; keep the never-rejects contract.
    console.error(e.message);
  }
  return out;
};
/**
 * Downloads every page in `links` in parallel and returns the text of the
 * first `.home-title > h2` element of each page that loaded.
 *
 * Each download carries its own error handler: Promise.all is fail-fast, so
 * without it a single failed request would discard every title.
 *
 * @param {string[]} links - Target page URLs.
 * @returns {Promise<string[]>} Titles of the pages that could be fetched and
 *   parsed, in input order with failed pages omitted. Failures are logged;
 *   never rejects.
 */
let FetchData = async (links) => {
  const out = [];
  try {
    const pages = await Promise.all(
      links.map((nurl) =>
        getHtmls(nurl).catch((e) => {
          console.error(e.message);
          return null; // marks a page that could not be downloaded
        })
      )
    );
    pages.forEach((html) => {
      if (html == null) {
        return;
      }
      try {
        const $ = cheerio.load(html);
        out.push($(".home-title > h2").eq(0).text());
      } catch (e) {
        // One unparsable page must not cost us the others.
        console.error(e.message);
      }
    });
  } catch (e) {
    // e.g. `links` is not an array; keep the never-rejects contract.
    console.error(e.message);
  }
  return out;
};
// Pipeline: province links -> office links -> page titles.
// Each step returns its promise so the chain stays flat, and the single
// .catch at the end reports anything that slips past the per-step handlers.
getLinks()
  .then((resultList) => getData(resultList))
  .then((resultSet) => FetchData(resultSet))
  .then((title) => {
    console.log(title);
  })
  .catch((e) => {
    console.error(e.message);
  });
Note: Instead of writing your own Promise wrapper, you could use request-promise package

Issue with your code is in FetchData function, as in that function you are passing links and then using map over it.
But if you look inside that map function and check the value of 'nurl' variable it will be an array of links and its data type would be object.
According to the semantics of request function, its first param should be string, so if you iterate over the 'nurl' variable to get the values, then it would work.
My code snippet for one url from array

Related

Unable to map inside an async function

In the below code, I am fetching data from an external api. After parsing the data as json, I wanted to map through it and get a modified version.
For some reason, the console.log(jsonData) inside the map function is not getting executed. Please check the code below for clarity
// Express-style handler: for gender FEMALE or MALE it fetches `URL`, parses the
// body as JSON, maps over it and responds with the parsed JSON; any other
// gender gets a 500 "Invalid category".
// `URL` is not defined in this snippet, and `countryCode` is destructured but never used.
const getRandomOutfit = async (req, res) => {
const { gender, countryCode } = req.params;
if (req.params.gender === "FEMALE" || req.params.gender === "MALE") {
try {
const response = await fetch(URL);
const jsonData = await response.json();
// The callback runs once per element, so it never runs for an empty array.
// If jsonData is not an array, .map throws and the catch below answers with 500.
const outputData = jsonData.map((productItem) => {
console.log(productItem); // doesn't get printed
// some operation
return productItem;
});
// Note: the response carries jsonData; outputData is not used.
await res.json(jsonData);
} catch (error) {
res.status(500).send("Error getting data");
}
} else {
res.status(500).send("Invalid category");
}
};
I'm confused about what I am missing here and making an error.
I rewrote the code to make it clearer to understand. In general, it is best to take the fail first approach. Notice, how the first thing I do is return upon failure.
As to why you code is not printing anything out, try printing jsonData. It might be that this is an empty array.
/**
 * Express-style handler that fetches the product list and responds with the
 * mapped items.
 *
 * @param {object} req - Request; `req.params.gender` must be "FEMALE" or "MALE".
 * @param {object} res - Response; receives the mapped array as JSON, or a 500
 *   with "Invalid category" / "Error getting data".
 * @returns {Promise<*>} Settles once a response has been sent.
 */
const getRandomOutfit = async (req, res) => {
  const { gender } = req.params;

  // Fail first: reject unsupported input before doing any work.
  if (gender !== "FEMALE" && gender !== "MALE") {
    return res.status(500).send("Invalid category");
  }

  try {
    const response = await fetch(URL);
    const jsonData = await response.json();
    console.log(jsonData); // what does this return?
    const outputData = jsonData.map((productItem) => {
      console.log(productItem); // runs once per item; prints nothing for an empty array
      // some operation
      return productItem;
    });
    // Send the mapped result, so the "some operation" step reaches the client.
    res.json(outputData);
  } catch (error) {
    res.status(500).send("Error getting data");
  }
};

Promise.all returning undefined in Node JS

I have a code to fetch directory names from first API. For every directory, need to get the file name from a second API. I am using something like this in my Node JS code -
// Fetches the directory listing, calls getFilename once per index of `dirs`,
// and replies {status: "ok"} once Promise.all over those calls resolves.
// (`fetch(...)` and the `dirs = ...` line are placeholders in the original post.)
async function main_function(req, res) {
// `response` receives the value of the last .then callback, which has no return statement.
const response = await fetch(...)
.then((response) => {
if (response.ok) {
return response.text();
} else {
return "";
}
})
.then((data) => {
// `dirs` and the loop variable `i` are assigned without a declaration in this snippet.
dirs = ...some logic to extract number of directories...
const tempPromises = [];
for (i = 0; i < dirs.length; i++) {
tempPromises.push(getFilename(i));
}
console.log(tempPromises); // Prints [ Promise { <pending> } ]
// This promise is neither returned nor awaited, so the outer chain does not wait for it.
Promise.all(tempPromises).then((result_new) => {
console.log(result_new); // This prints "undefined"
res.send({ status: "ok" });
});
});
}
// Fetches one file entry; the promise chain ends up producing "Temp Name" in `response`.
// The function body has no `return`, so callers receive a promise that
// resolves to undefined even though `response` holds the string.
async function getFilename(inp_a) {
const response = await fetch(...)
.then((response) => {
if (response.ok) {
return response.text();
} else {
return "";
}
})
.then((data) => {
// Wrapping the value in a new Promise is equivalent to returning "Temp Name" directly here.
return new Promise((resolve) => {
resolve("Temp Name");
});
});
}
What am I missing here?
Your getFilename() doesn't seem to be returning anything i.e it's returning undefined. Try returning response at the end of the function,
// Sketch of the fix: return the awaited value so the async function's promise
// resolves with it instead of undefined. (`...` stands for the fetch chain above.)
async function getFilename(inp_a) {
const response = ...
return response;
}
Thanks to Mat J for the comment. I was able to simplify my code and also learn when not to use chaining.
Also thanks to Shadab's answer which helped me know that async function always returns a promise and it was that default promise being returned and not the actual string. Wasn't aware of that. (I am pretty new to JS)
Here's my final code/logic which works -
async function main_function(req,res){
try{
const response = await fetch(...)
const resp = await response.text();
dirs = ...some logic to extract number of directories...
const tempPromises = [];
for (i = 0; i < dirs.length; i++) {
tempPromises.push(getFilename(i));
}
Promise.all(tempPromises).then((result_new) => {
console.log(result_new);
res.send({ status: "ok" });
});
}
catch(err){
console.log(err)
res.send({"status" : "error"})
}
}
async function getFilename(inp_a) {
const response = await fetch(...)
respText = await response.text();
return("Temp Name"); //
}

AsyncStorage before list render React Native

I have an app that list incidents of a animals charity ong.
After the login, the ong is directed to your incidents list, so i must pass your ID of login page to load this list.
I'm trying to load the variables of AsyncStorage in a function and then pass it to a callback that load the incidents list in React useEffect.
Code:
export default function Incidents() {
const [incidents, setIncidents] = useState([]);
const [total, setTotal] = useState(0);
const [page, setPage] = useState(1);
const [loading, setLoading] = useState(false);
const [id, setID] = useState('');
const [name, setName] = useState('');
loadStorage:
// Reads 'ongID' and 'ongName' from AsyncStorage, stores them in component
// state via setID/setName, then invokes `callback`.
// `callback` runs whether or not the reads succeeded or the setters were called.
loadStorage = async function(callback) {
try {
const ongid = await AsyncStorage.getItem('ongID');
const ongname = await AsyncStorage.getItem('ongName');
// Parsed as `ongid && (ongname !== null)`: ongid is only checked for truthiness.
if (ongid && ongname !== null) {
setID(ongid);
setName(ongname);
}
} catch (e) {
alert(e);
}
// NOTE(review): presumably `id` as seen by the callback is still the value from
// the render that created it, not the one just passed to setID — confirm.
callback();
}
loadIncidents:
// Loads the next page of incidents from the 'profiles' endpoint, using the
// `id` state value as the Authorization header, and appends them to `incidents`.
loadIncidents = async function() {
if (loading) {
return;
}
// NOTE(review): `lenght` looks like a typo for `length`; as written the property
// is undefined, so this "everything already loaded" guard cannot match.
if (total > 0 && incidents.lenght === total) {
return;
}
// NOTE(review): setLoading(true) is never called in this snippet, only setLoading(false) below.
try {
const response = await api.get('profiles', {
headers: {
Authorization: id,
},
params: { page },
});
setIncidents([ ... incidents, ... response.data]);
setTotal(response.headers['x-total-count']);
setPage(page +1);
setLoading(false);
} catch (e) {
alert(e); //When i access the incident list page i got the error: 'Error: Request failed with status code 400'
}
}
useEffect:
// Registered once (empty dependency array): on every 'focus' event of the
// navigation object, load the stored ONG data and then the incident list.
// The value returned by addListener is not kept, so the effect has no cleanup.
useEffect(() => {
navigation.addListener('focus', () => {
loadStorage(loadIncidents);
});
}, []);
The error in the alert (e) line of loadIncidents happens because useEffect is not fully calling the loadStorage (loadIncidents) part the first time I enter the application.
I have another page called newIncident; if I navigate to it after pressing OK on this error and then go back to the incident list page (via navigation.goBack()), the list is loaded with all the incidents of the logged-in ONG.
Need this behavior when I first enter the page that lists the ONG's incidents. Since the both methods are asynchronous, i'm not be able to figure out how to do this.
Maybe you can consider using react-native-easy-app. After completing the initial binding, it can enable you to access the properties in AsyncStorage synchronously in the form of assignment and value, because it is simple enough, so you will not appear above Various problems.
Did it using promises. code below:
/**
 * Reads the logged-in ONG's id and name from AsyncStorage and mirrors them
 * into component state.
 *
 * An async function already returns a promise, so the value is returned
 * directly. When no id is stored the promise resolves with null; the earlier
 * version wrapped the result in a `new Promise` that never settled in that
 * case, which left `.then(...)` callers waiting forever.
 *
 * @returns {Promise<{name: string, id: string} | null>} The stored ONG, or
 *   null when 'ongID' is absent. A storage error is shown via alert and the
 *   empty-string defaults are used.
 */
async function loadStorage() {
  let ongid = '';
  let ongname = '';
  try {
    ongid = await AsyncStorage.getItem('ongID');
    ongname = await AsyncStorage.getItem('ongName');
  } catch (e) {
    alert(e);
  }
  if (ongid === null) {
    return null;
  }
  setID(ongid);
  setName(ongname);
  return { 'name': ongname, 'id': ongid };
}
Then in useEffect information loads like this:
// Run loadIncidents only after loadStorage's promise resolves; loadIncidents
// is called with the resolved value as its argument.
loadStorage()
.then(loadIncidents)

I have a question about crawling. I wanna use the return value out of the function

I'm doing crawling with Node.js and I wanna use the return value (which is titleList) out of the function, but that doesn't work out of that function.
Please give me some advice about that.
const axios = require("axios");
const cheerio = require('cheerio');
// Fetches the page with axios and returns the axios response object.
// On failure the error is logged and nothing is returned, so the promise
// resolves to undefined.
async function getHTML()
{
try
{
return await axios.get("https://google.com");
} catch (error) {
console.error(error);
}
}
// Parses the fetched page and builds `titleList`: one {title} object per <li>
// of `ul.new_quickMenu_list`, using the text of the <span> elements inside it.
// The array returned from this callback becomes the resolved value of the
// promise that .then() returns; nothing in this snippet consumes that promise.
getHTML().then(html =>
{
let titleList = [];
// `html` is undefined when getHTML caught an error, in which case `html.data` throws.
const $ = cheerio.load(html.data);
// Store the matched list in bodyList
const bodyList = $("ul.new_quickMenu_list");
bodyList.find("li").each(function(i, elem)
{
titleList[i] = {
title : $(this).find("span").text()
};
});
//console.log(titleList);
return titleList;
})

Trouble returning result when I make use of promise

I've created a script in node using promise in combination with cheerio to parse the links to the titles of different posts from a website and then scrape the title of each post from its inner page by reusing those links.
My current script can fetch them accordingly If I uncomment this line console.log($("h1 > a").eq(0).text()); within getData. However, it appears that the second function still doesn't return anything.
How can I make the script run successfully the way it is now?
I've written so far:
const request = require('request');
const cheerio = require('cheerio');
const link = 'https://stackoverflow.com/questions/tagged/web-scraping';
const base_link = 'https://stackoverflow.com';
const items = [];
const titles = [];
// Requests the tag page (`link`) and resolves with the module-level `items`
// array after appending base_link + href of each `.summary` block's
// `.question-hyperlink`.
// `error` is never inspected and `reject` is never called.
let getLinks = () => {
return new Promise((resolve, reject) => {
request(link, function(error, response, html) {
let $ = cheerio.load(html);
$('.summary').each(function() {
items.push(base_link + $(this).find(".question-hyperlink").attr("href"));
});
resolve(items);
});
});
};
// Starts one request per link; each callback pushes the page's first `h1 > a`
// text onto the module-level `titles` array.
let getData = (links) => {
return new Promise((resolve, reject) => {
for (let nurl of links) {
request(nurl, function(error, response, html) {
let $ = cheerio.load(html);
titles.push($("h1 > a").eq(0).text())
// console.log($("h1 > a").eq(0).text());
});
// resolve() sits inside the loop but outside the request callback, so it
// runs during the first iteration, before any callback has pushed a title.
resolve(titles);
}
});
};
// Feeds the collected links into getData. The resulting promise is returned
// from the callback, but nothing here logs or otherwise consumes its value.
getLinks().then((resultList) => {
return getData(resultList)
})
Upon executing the above script, I get no result, no error either.
Your problem is that request is async so you will see the console logs when the callbacks are executed.
However, you are resolving the promise in the first iteration of your for loop. Therefore you return an empty array.
You would need to resolve the promise only when the last request has completed:
/**
 * Requests every URL in `links` and pushes each page's first `h1 > a` text
 * onto the shared `titles` array, resolving once the last request completes.
 *
 * @param {string[]} links - Page URLs to fetch.
 * @returns {Promise<string[]>} Resolves with `titles` (entries are in
 *   completion order, not input order). Resolves immediately for an empty
 *   `links`; rejects with the first request or parse error.
 */
let getData = (links) => {
  return new Promise((resolve, reject) => {
    // Without this guard no callback ever fires and the promise never settles.
    if (links.length === 0) {
      resolve(titles);
      return;
    }
    let count = 0;
    for (const nurl of links) {
      request(nurl, function (error, response, html) {
        if (error) {
          reject(error);
          return;
        }
        try {
          const $ = cheerio.load(html);
          titles.push($("h1 > a").eq(0).text());
        } catch (e) {
          // A throw inside this callback would otherwise escape the promise entirely.
          reject(e);
          return;
        }
        count++; // one more request finished
        if (count === links.length) {
          resolve(titles); // the last outstanding request has completed
        }
      });
    }
  });
};
Alternatively, you could try wrapping each request in a promise and then using Promise.all(), which resolves with an array of results when all promises have completed:
/**
 * Requests every URL in `links` in parallel, one promise per request, and
 * collects each page's first `h1 > a` text.
 *
 * @param {string[]} links - Page URLs to fetch.
 * @returns {Promise<string[]>} Titles in the same order as `links`; rejects
 *   with the first request or parse error (Promise.all is fail-fast).
 */
let getData = (links) => {
  const promises = links.map((nurl) => new Promise((resolve, reject) => {
    request(nurl, function (error, response, html) {
      if (error) {
        reject(error);
        return;
      }
      try {
        const $ = cheerio.load(html);
        resolve($("h1 > a").eq(0).text());
      } catch (e) {
        // A throw inside this callback would otherwise leave the promise pending.
        reject(e);
      }
    });
  }));
  return Promise.all(promises);
};

Resources