Asynchronous function in Node.js API not working as intended

As an exercise, I'm creating a simple API that allows users to provide a search term to retrieve links to appropriate news articles across a collection of resources. The relevant function, and the route handler that uses it, are as follows:
function GetArticles(searchTerm) {
    const articles = [];
    // Loop through each resource
    resources.forEach(async resource => {
        const result = await axios.get(resource.address);
        const html = result.data;
        // Use Cheerio: load the html document and create Cheerio selector/API
        const $ = cheerio.load(html);
        // Filter html to retrieve appropriate links
        $(`a:contains(${searchTerm})`, html).each((i, el) => {
            const title = $(el).text();
            let url = $(el).attr('href');
            articles.push({
                title: title,
                url: url,
                source: resource.name
            });
        })
    })
    return articles; // Empty array is returned
}
And the route handler that uses the function:
app.get('/news/:searchTerm', async (req, res) => {
    const searchTerm = req.params.searchTerm;
    const articles = await GetArticles(searchTerm);
    res.json(articles);
})
The problem I'm getting is that the returned "articles" array is empty. However, if I don't loop over each resource as commented at the beginning of GetArticles, but instead perform the main logic on just a single resource, "articles" is returned with the requested data and is not empty. In other words, if the function is the following:
async function GetArticles(searchTerm) {
    const articles = [];
    const result = await axios.get(resources[0].address);
    const html = result.data;
    const $ = cheerio.load(html);
    $(`a:contains(${searchTerm})`, html).each((i, el) => {
        const title = $(el).text();
        let url = $(el).attr('href');
        articles.push({
            title: title,
            url: url,
            source: resources[0].name
        });
    })
    return articles; // Populated array
}
Then "articles" is not empty, as intended.
I'm sure this has to do with how I'm dealing with the asynchronous nature of the code. I've tried refreshing my knowledge of asynchronous programming in JS but I still can't quite fix the function. Clearly, the "articles" array is being returned before it's populated, but how?
Could someone please help explain why my GetArticles function works with a single "resource" but not when looping over an array of "resources"?

Try this
function GetArticles(searchTerm) {
    return Promise.all(resources.map(resource => axios.get(resource.address)))
        .then(responses => responses.flatMap((result, i) => {
            const html = result.data;
            // Use Cheerio: load the html document and create Cheerio selector/API
            const $ = cheerio.load(html);
            let articles = [];
            // Filter html to retrieve appropriate links
            $(`a:contains(${searchTerm})`, html).each((j, el) => {
                const title = $(el).text();
                let url = $(el).attr('href');
                articles.push({
                    title: title,
                    url: url,
                    source: resources[i].name // Promise.all preserves order, so index i matches the resource
                });
            });
            return articles;
        }));
}
The problem in your implementation was here:

resources.forEach(async resource => ...

You defined the callback as async, but forEach neither waits for the promises those callbacks return nor returns them to you, so GetArticles returns the articles array before any request has finished. That's why your array is always empty.
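If you prefer to keep the sequential style instead, a minimal sketch (assuming the same resources, axios, and cheerio setup as in the question) is to make GetArticles itself async and replace forEach with a for...of loop, which does honor await:

async function GetArticles(searchTerm) {
    const articles = [];
    // for...of pauses on each await, unlike forEach
    for (const resource of resources) {
        const result = await axios.get(resource.address);
        const $ = cheerio.load(result.data);
        $(`a:contains(${searchTerm})`).each((i, el) => {
            articles.push({
                title: $(el).text(),
                url: $(el).attr('href'),
                source: resource.name
            });
        });
    }
    return articles; // populated only after every resource has been processed
}

The trade-off is that the requests now run one after another, whereas the Promise.all version above fetches all resources in parallel.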

Related

How to parse XML feed URL and store items in Firestore using cloud functions?

I have been given an assignment to fetch a JSON API, and also parse an XML feed URL and store their responses inside separate Firestore collections. I am not really good at cloud functions, but after lots of research, I have written the cloud function code below for the JSON API and it works well.
const functions = require("firebase-functions");
const axios = require("axios");
const admin = require("firebase-admin");

const api_token = "XXXXXXX";
const includes = "XXXXXX";
const url = "https://XXXXXXXXXXXXXX.com/?api_token=" + api_token + includes;

exports.allLeagues = functions.region('europe-west1').https.onRequest(async (req, res) => {
    try {
        let response = await axios.get(url);
        var data = response.data.data;
        for (let leagueData of data) {
            await admin.firestore().collection("leagues").doc(leagueData.id.toString()).collection("all_data").doc(leagueData.id.toString()).set({
                id: leagueData.id,
                name: leagueData.name,
                logo_path: leagueData.logo_path,
                is_cup: leagueData.is_cup
            });
        }
        console.log("Table complete...");
        console.log("successful");
        return res.status(200).json({ message: "successful" });
    } catch (error) {
        console.log("Error encountered: " + error);
        return res.status(500).json({ error });
    }
});
I am through with the JSON API. But for the XML feed, I don't know where to start. I have done lots of research to no avail. I found this on Stack Overflow but it doesn't address my need. Assuming this is my feed: https://www.feedforall.com/sample.xml, how do I parse it and save the items inside Firestore?
Kindly help.
Thank you.
You can use rss-parser, which can fetch data from RSS feed URLs or parse XML strings, as shown below:

// npm install rss-parser
const functions = require("firebase-functions");
const admin = require("firebase-admin");
const Parser = require("rss-parser");

const parser = new Parser();
const db = admin.firestore(); // assumes admin.initializeApp() has already been called

exports.rssFeedParser = functions.https.onRequest(
    async (request, response) => {
        const rssUrl = "https://www.feedforall.com/sample.xml";
        const feed = await parser.parseURL(rssUrl);
        const { items } = feed;
        const batch = db.batch();
        items.forEach((item) => {
            const docRef = db.collection("rss").doc();
            // restructure item if needed
            batch.set(docRef, item);
        });
        await batch.commit();
        response.send("Done");
    }
);
Do note that a single batched write can contain at most 500 operations. If your feed can return more items than that, you should create multiple batches of up to 500 or add the documents individually, as sketched below.
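A minimal sketch of that chunking, reusing the db and items from the answer above (untested):

// Inside the same async request handler: commit in chunks of up to 500
const BATCH_LIMIT = 500; // Firestore's per-batch operation limit
for (let start = 0; start < items.length; start += BATCH_LIMIT) {
    const batch = db.batch();
    items.slice(start, start + BATCH_LIMIT).forEach((item) => {
        batch.set(db.collection("rss").doc(), item);
    });
    await batch.commit();
}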

Firebase cloud function: http function returns null

Here is what I am trying to do.
1. I am introducing functionality to enable users to search for local restaurants.
2. I created an HTTP cloud function so that when the client delivers a keyword, the function calls an external API to search for the keyword, fetches the responses, and delivers the results.
3. In doing #2, I need to make two separate URL requests and merge the results.
When I checked, the function does call the API, fetch the results and merge them without any issue. However, for some reason, it only returns null to the client.
Below is the code: could someone take a look and advise me on where I went wrong?
exports.restaurantSearch = functions.https.onCall((data, context) => {
    const request = data.request;
    const k = encodeURIComponent(request);
    const url1 = "an_url_to_call_the_external_API" + k;
    const url2 = "another_url_to_call_the_external_API" + k;
    const url_array = [url1, url2];
    const result_array = [];
    const info_array = [];
    url_array.forEach(url => {
        return fetch(url, { headers: { "Authorization": "API_KEY" } })
            .then(response => {
                return response.json()
            })
            .then(res => {
                result_array.push(res.documents);
                if (result_array.length === 2) {
                    const new_result_array_2 = [...new Set(result_array)];
                    new_result_array_2.forEach(nra => {
                        info_array.push([nra.place_name, nra.address_name])
                    })
                    // info_array is not null at this point, but the below code only returns null when checked from the client
                    return info_array;
                }
            })
            .catch(error => {
                console.log(error)
                return 'error';
            })
    })
});
Thanks a lot in advance!
You should use Promise.all() instead of running each promise (fetch request) separately in a forEach loop. Also, I don't see the function returning anything if result_array.length is not 2. I can see there are only 2 requests that you are making, but it's good to handle all possible cases, so try adding a return statement for when the condition is not satisfied. Try refactoring your code to this (I've used an async function):
exports.restaurantSearch = functions.https.onCall(async (data, context) => {
    // Do note the async ^^^^^
    const request = data.request;
    const k = encodeURIComponent(request);
    const url1 = "an_url_to_call_the_external_API" + k;
    const url2 = "another_url_to_call_the_external_API" + k;
    const url_array = [url1, url2];
    const responses = await Promise.all(url_array.map((url) => fetch(url, { headers: { "Authorization": "API_KEY" } })))
    const responses_array = await Promise.all(responses.map((response) => response.json()))
    console.log(responses_array)
    const result_array = responses_array.map((res) => res.documents)
    // Although this if statement is redundant if you will be running exactly 2 promises
    if (result_array.length === 2) {
        const new_result_array_2 = [...new Set(result_array)];
        const info_array = new_result_array_2.map(({ place_name, address_name }) => ({ place_name, address_name }))
        return { data: info_array }
    }
    return { error: "Array length incorrect" }
});
If you'll be running only 2 promises, another option would be:
// Directly adding promises in Promise.all() instead of using map
const [res1, res2] = await Promise.all([fetch("url1"), fetch("url2")])
const [data1, data2] = await Promise.all([res1.json(), res2.json()])
Also check Fetch multiple links inside of forEach loop

Download files before build in gatsby wordpress

I have a client I'm working with who needs his PDFs to be readable in the browser, without the user having to download them first. It turned out not to be an option to do this through WordPress, so I thought I could download them in Gatsby before every build if they don't already exist, and I was wondering if this is possible.
I found this repo: https://github.com/jamstack-cms/jamstack-ecommerce
that shows a way to do it with this code:
function getImageKey(url) {
    const split = url.split('/')
    const key = split[split.length - 1]
    const keyItems = key.split('?')
    const imageKey = keyItems[0]
    return imageKey
}

function getPathName(url, pathName = 'downloads') {
    let reqPath = path.join(__dirname, '..')
    let key = getImageKey(url)
    key = key.replace(/%/g, "")
    const rawPath = `${reqPath}/public/${pathName}/${key}`
    return rawPath
}

async function downloadImage(url) {
    return new Promise(async (resolve, reject) => {
        const path = getPathName(url)
        const writer = fs.createWriteStream(path)
        const response = await axios({
            url,
            method: 'GET',
            responseType: 'stream'
        })
        response.data.pipe(writer)
        writer.on('finish', resolve)
        writer.on('error', reject)
    })
}
but it doesn't seem to work if I put it in my createPages, and I can't use it outside it either because I don't have access to graphql to query the data first.
Any idea how to do this?
The WordPress source example defines createPages as async:

exports.createPages = async ({ graphql, actions }) => {

...so you can already use await to download your file(s) just after querying the data (and before the createPage() call). It should (NOT TESTED) be as easy as:
// Check for any errors
if (result.errors) {
    console.error(result.errors)
}
// Access query results via object destructuring
const { allWordpressPage, allWordpressPost } = result.data
const pageTemplate = path.resolve(`./src/templates/page.js`)
// for...of instead of forEach so that await is valid and actually honored here
for (const edge of allWordpressPage.edges) {
    // for one file per edge
    // url taken/constructed from some edge property
    await downloadImage(url);
    createPage({
Of course, for multiple files you should use Promise.all to wait for all of the returned download promises to resolve before creating the page:
for (const edge of allWordpressPage.edges) {
    // for multiple files per edge (page)
    // url taken/constructed from some edge properties in a loop
    // adapt 'paths' of the iterable (edge.xxx.yyy...)
    // and/or the downloadImage(image) argument, e.g. 'image.someUrl'
    await Promise.all(
        edge.node.someImageArrayNode.map(image => { return downloadImage(image); })
    );
    createPage({
If you need to pass/update image nodes (for component usage), you should be able to mutate the nodes, e.g.:
await Promise.all(
    edge.node.someImageArrayNode.map(image => {
        image["fullUrl"] = `/publicPath/${image.url}`;
        return downloadImage(image.url); // return Promise at the end
    })
);
createPage({
    path: slugify(item.name),
    component: ItemView,
    context: {
        content: item,
        title: item.name,
        firstImageUrl: edge.node.someImageArrayNode[0].fullUrl,
        images: edge.node.someImageArrayNode
    }
});
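Putting the fragments together, a consolidated sketch (untested; pagesQuery stands in for your existing WordPress query, someImageArrayNode for whatever image field your schema actually exposes, and downloadImage comes from the question):

exports.createPages = async ({ graphql, actions }) => {
    const { createPage } = actions;
    const result = await graphql(pagesQuery); // pagesQuery: hypothetical, your existing query string
    if (result.errors) {
        console.error(result.errors);
    }
    const { allWordpressPage } = result.data;
    const pageTemplate = path.resolve(`./src/templates/page.js`);
    // for...of so every download finishes before the page is created
    for (const edge of allWordpressPage.edges) {
        await Promise.all(
            edge.node.someImageArrayNode.map(image => downloadImage(image.url))
        );
        createPage({
            path: `/${edge.node.slug}/`, // assumes a slug field on the node
            component: pageTemplate,
            context: { id: edge.node.id },
        });
    }
};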

Scrape paginate using nodejs, cheerio

How can I scrape data from paginated pages?
My code works well with one page, but I need to scrape all the data from page 2, page 3, ... and push it into an ebooks array.
Here is my code:
function searchEbooks(query) {
    return fetch(getUrl(1, query))
        .then(res => res.text())
        .then(body => {
            const $ = cheerio.load(body); // load the fetched html so $ is defined
            const ebooks = [];
            $('article').each(function (i, element) {
                const $element = $(element);
                const $title = $element.find('.entry-title a');
                const $image = $element.find('.attachment-post-thumbnail');
                const $description = $element.find('.entry-summary');
                const authors = [];
                $(element).find('.entry-author a').each(function (i, element) {
                    const author = $(element).text();
                    authors.push(author);
                });
                const ebook = {
                    image: $image.attr('src'),
                    title: $title.text(),
                    description: $description.text(),
                    authors: authors,
                }
                ebooks.push(ebook);
            });
            return ebooks;
        });
}
I have no idea how to do this. Please give me a hint or an example.
I use the cheerio and node-fetch packages.
Thank you.
Try this to get the next page URL:
var href = $('.current+a').attr('href');
if (href) {
    // you can fetch this url for the next page
} else {
    console.log('You got all pages');
}

node.js Get.Request & Pagination & Async

I'm having a tremendously tough time organizing the flow here, as I'm self-taught, so I'm wondering if someone might be able to assist.
var channelIds = ['XYZ','ABC','QRS']
var playlistIds = [];
var videoIds = [];
ORDER OF PROCESS
1. Get all playlist IDs: if the returned GET request JSON contains a nextPageToken, run the GET request again with that page token before going to (2)
2. Get all video IDs: if the returned GET request JSON contains a nextPageToken, run the GET request again with that page token before going to (3)
3. Aggregate into the final array: I need to put everything in an array such as:
var ArrFinal = [{channelId,playlistID,videoId},{channelId,playlistID,videoId},{channelId,playlistID,videoId}];
I don't necessarily need someone to write the whole thing. I'm trying to better understand the most efficient way to know when the previous step is done, but also handle the nextPageToken iteration.
I'm not familiar with the YouTube API, but what you basically need is a get function for each endpoint that also takes care of the nextPageToken.
Something like this (not tested):
'use strict';

const Promise = require('bluebird');
const request = Promise.promisifyAll(require('request'));

const playlistEndpoint = '/youtube/v3/playlists';
const baseUrl = 'https://www.googleapis.com'

const channelIds = ['xy', 'ab', 'cd'];

const getPlaylist = async (channelId, pageToken, playlists) => {
    const url = `${baseUrl}${playlistEndpoint}`;
    const qs = {
        channelId,
        maxResults: 25,
        pageToken
    };
    try {
        // json: true makes request parse the response body, so body.nextPageToken works
        const playlistRequest = await request.getAsync({ url, qs, json: true });
        const nextPageToken = playlistRequest.body.nextPageToken;
        // if we already had items, combine with the new ones
        const items = playlists ? playlists.concat(playlistRequest.body.items) : playlistRequest.body.items;
        if (nextPageToken) {
            // if token, do the same again and pass results to function
            return getPlaylist(channelId, nextPageToken, items);
        }
        // if no token we are finished
        return items;
    }
    catch (e) {
        console.log(e.message);
    }
};
const getVideos = async (playlistId, pageToken, videos) => {
    // pretty much the same as above
}
// note: awesome must be async for the awaits inside it to be valid
async function awesome(channelIds) {
    const fancyArray = [];
    await Promise.map(channelIds, async (channelId) => {
        const playlists = await getPlaylist(channelId);
        await Promise.map(playlists, async (playlistId) => {
            const videos = await getVideos(playlistId);
            videos.forEach(videoId => {
                fancyArray.push({ channelId, playlistId, videoId })
            })
        });
    });
    return fancyArray;
}

awesome(channelIds)
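For completeness, a sketch of what that getVideos stub could look like under the same pattern (untested; it assumes the YouTube playlistItems endpoint and the same parsed-JSON request options as getPlaylist above):

const videoEndpoint = '/youtube/v3/playlistItems';

const getVideos = async (playlistId, pageToken, videos) => {
    const url = `${baseUrl}${videoEndpoint}`;
    const qs = {
        playlistId,
        maxResults: 25,
        pageToken
    };
    try {
        const videoRequest = await request.getAsync({ url, qs, json: true });
        const nextPageToken = videoRequest.body.nextPageToken;
        // accumulate across pages, exactly as getPlaylist does
        const items = videos ? videos.concat(videoRequest.body.items) : videoRequest.body.items;
        if (nextPageToken) {
            return getVideos(playlistId, nextPageToken, items);
        }
        return items;
    }
    catch (e) {
        console.log(e.message);
    }
};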
// UPDATE
This may produce a lot of concurrent requests; you can limit them with Bluebird's concurrency option:
Promise.map(items, item => { somefunction() }, { concurrency: 5 });
