Firebase function will not deploy when requiring outside packages - node.js

I am having trouble deploying my web scraping function and do not know how to fix the issue.
Index.js
const functions = require("firebase-functions");
const pup = require("puppeteer");
const WebsiteData = require('./schema');
exports.scrape = functions
.runWith({ memory: '1GB' })
.pubsub.schedule('0 0 * * *')
.onRun(async (context) => {
const browser = await pup.launch({
headless: true,
args: ['--no-sandbox', '--disable-setuid-sandbox']
});
const page = await browser.newPage();
await page.setUserAgent(
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.125 Safari/537.36'
)
var pages = [
{
name: 'aave-protocol',
fee:'0.09',
url: 'https://aave.com',
},
{
name: 'uniswap-v2',
fee:'0.30',
url: 'https://uniswap.org',
},
{
name: 'dydx',
fee:'0.00',
url: 'https://dydx.exchange',
},
{
name: 'dodo-bsc',
fee:'0.00',
url: 'https://dodoex.io',
},
{
name: 'cream-finance',
fee:'0.03',
url: 'https://cream.finance',
},
{
name: 'multiplier-finance',
fee:'0.06',
url: 'https://multiplier.finance',
}
]
var result = [];
for (var each in pages) {
await page.goto(`https://www.dapp.com/app/${pages[each].name}`, { waitUntil: 'networkidle0' })
var users = await page.evaluate(() => {
return document.querySelector('#app > div.app-container > div.root.dapp-detail > section.detail-page-outer > div > div.left-sec > div.stats-card > div.chart-section > div:nth-child(2) > div:nth-child(4) > span.value > span').innerText
})
var volume = await page.evaluate(() => {
return document.querySelector('#app > div.app-container > div.root.dapp-detail > section.detail-page-outer > div > div.left-sec > div.stats-card > div.chart-section > div:nth-child(3) > div:nth-child(4) > span.value > span:nth-child(2)').innerText
})
var obj = { "name": `${pages[each].name}`, "nDUniqueUsers": `${users}`, "nDTransactions": `${volume}`, "fee": `${pages[each].fee}`, "url": `${pages[each].url}`};
result.push(obj);
}
await browser.close();
const websiteMongo = new WebsiteData({ sites: result });
await websiteMongo.save(function(err,data){
if (err) {
console.log(err);
return null;
}
});
console.log("Done.")
return null;
});
The function is meant to use puppeteer to open up around 5 pages collect the info and upload the data to a MongoDB database. The code works perfectly in local host, but when I run firebase deploy, I get an error. Here is the error message: "Function failed on loading user code. This is likely due to a bug in the user code. Error message: Error: please examine your function logs to see the error cause: https://cloud.google.com/functions/docs/monitoring/logging#viewing_logs. Additional troubleshooting documentation can be found at https://cloud.google.com/functions/docs/troubleshooting#logging. Please visit https://cloud.google.com/functions/docs/troubleshooting for in-depth troubleshooting documentation."}"
I know that the problem consists of these two lines:
const pup = require("puppeteer");
const WebsiteData = require('./schema');
When I comment out those two lines I can deploy the function. Here is the code for schema.js:
var mongoose = require('mongoose')
mongoose.connect("URI");
var Schema = mongoose.Schema
var websiteData = new Schema({
sites: [{
name: {
type: String,
required : true,
},
nDUniqueUsers: {
type: String,
required : true,
},
nDTransactions: {
type: String,
required : true,
}, fee: {
type: String,
required : true,
}, url: {
type: String,
required : true,
}
}],
})
var WebsiteData = mongoose.model("SiteData", websiteData);
module.exports = WebsiteData
I do not know how to fix this. Any help would be greatly appreciated.

I suspect you haven't included "puppeteer" in the dependencies section of the package.json deployed alongside your function. Or possibly you've inadvertently included it in the devDependencies section instead of dependencies.
// package.json
{
// ...
"dependencies": {
"puppeteer": "^13.5.1" // be sure you have this
}
}

Related

A non-serializable value was detected in the state: redux toolkit

I am trying to implement Redux toolkit store in a react native application. It is my first time. I am using axios to make requests to the API.
I am getting this error on a post request:
A non-serializable value was detected in the state, in the path: `user`. Value: {"_A": null,
"_x": 0, "_y": 0, "_z": null}
Take a look at the reducer(s) handling this action type: userreg/fulfilled.
Here are my codes:
const initialState={
token: '',
session: '',
loading: false,
}
export const logInUser = createAsyncThunk('userreg', async(name, thunkAPI)=>{
const {user_username, user_password} = name
//const m = JSON.stringify(name)
const res = await axios.post(`${BASE_URL}/login`,
{
user_username, user_password
}
)
//console.log(res.data)
return res.data
})
const userSlice = createSlice({
name: 'user',
initialState,
reducers:{
addSession:(state, action)=>{
state.session = AsyncStorage.getItem('sessionId')
},
addToken:(state, action)=>{
state.token = AsyncStorage.getItem('token')
}
},
extraReducers: (builders)=>{
builders.addCase(logInUser.pending, (state, action)=>{
state.loading = true
})
builders.addCase(logInUser.fulfilled, async(state, {payload})=>{
console.log(payload, 'slode ier')
state.loading = false
state.token = payload.userToken;
state.session = payload.session;
await AsyncStorage.setItem('sessionId', payload.session);
await AsyncStorage.setItem('token', payload.userToken)
})
}
})
export default userSlice.reducer;
const handleUserLog = async ()=>{
const logDetails = {
user_username: userLogin.user_username,
user_password: userLogin.user_password,
}
dispatch(logInUser(logDetails))
}
When I implemented the same code with fetch instead of axios to make the same request to the API, it didn't give this error.

Adding additional spec files to an angular project, not loading/defining correctly?

Caveat: I am not the author of this project. Whoever originally wrote this is no longer with the organization and I am seemingly the most knowledgeable on this topic at this point.
I know a little about javascript and unit tests, so I successfully added one .spec.js file. I tried adding a second one for another module, reusing a lot of the spec setup, and it immediately broke.
Project resources:
Nodejs 12.16.1
jasmine-node-karma: "^1.6.1"
karma: "^6.3.12"
Contents of ./karma.conf.js:
module.exports = function(config) {
config.set({
basePath: './public',
frameworks: ['jasmine', 'jquery-3.2.1'],
files: [
"../node_modules/angular/angular.js",
"../node_modules/angular-mocks/angular-mocks.js",
"../node_modules/bootstrap/dist/js/bootstrap.js",
"../public/**/*.js",
],
exclude: [
],
preprocessors: {
},
client: {
captureConsole: true
},
browserConsoleLogOptions: {
terminal: true,
level: ""
},
reporters: ['progress'],
port: 9876,
colors: true,
logLevel: config.LOG_INFO,
autoWatch: true,
browsers: ['FirefoxHeadless', 'ChromeHeadlessNoSandbox', 'PhantomJS'],
customLaunchers: {
ChromeHeadlessNoSandbox: {
base: 'ChromeHeadless',
flags: ['--no-sandbox']
},
FirefoxHeadless: {
base: 'Firefox',
flags: ['-headless'],
}
},
singleRun: false,
concurrency: Infinity
})
}
Originally I added ./public/controllers.spec.js to match the existing ./public/controllers.js. These unit tests pass and continue to do so.
Yesterday I added ./public/backups/backupcontrollers.spec.js to match ./public/backups/backupcontrollers.js.
Contents of ./public/backups/backupcontrollers.js:
/**
* Angular controller.
*/
'use strict'
const backupApp = angular.module('backup', [])
const backupTypePath = 'elasticsearch'
backupApp.controller('BackupFormController', ['$scope', '$filter', '$http', function ($scope, $filter, $http) {
console.log('Started BackupFormController')
$scope.itemInstances = []
$scope.fetchStatus = 'Ready!'
$scope.processSelection = function (item, backupType = backupTypePath) {
$scope.currentItem = item.metadata.name
$scope.getBackup(backupType)
console.log('currentItem after selecting from dropdown: ' + $scope.currentItem)
}
$scope.init = function (backupType = backupTypePath) {
$scope.refreshItemInstances(backupType)
console.log('currentItem after loading page for first time: ' + $scope.currentItem)
}
$scope.getBackup = function (backupType = backupTypePath) {
const path = `/v1/backup/${backupType}`
$scope.fetchStatus = `Fetching Backups for Item ${$scope.currentItem}...`
console.log(`Fetching backups for item from ${path}`)
$http.get('/api', { headers: { path: path, item: $scope.currentItem } })
.success(function (data, status, headers, config) {
console.log(`Got data from GET on path ${path}, HTTP status ${status}: ${JSON.stringify(data)}`)
if (typeof data === 'string' || data instanceof String) {
$scope.backups = data.split(/\r?\n/)
} else {
$scope.backups = data
}
$scope.fetchStatus = 'Ready!'
console.log('Done fetching backup list for item:' + $scope.currentItem + '!')
})
.error(function (data, status, header, config) {
console.log(data)
$scope.fetchStatus = 'Ready!'
})
}
// Refresh the list of displayed Item instances
$scope.refreshItemInstances = function (backupType = backupTypePath) {
console.log('Fetching list of all items in the system ...')
$scope.fetchStatus = 'Fetching Items ... '
$http.get('/env')
.success(function (data, status, headers, config) {
console.log(data)
for (let i = 0; i < data.length; i++) {
$scope.itemInstances.push(data[i])
}
$scope.currentItem = $scope.itemInstances[0].metadata.name
console.log('Done fetching list of all items!')
console.log('currentItem after fetching list of all items: ' + $scope.currentItem)
$scope.fetchStatus = 'Ready!'
$scope.getBackup(backupType)
})
.error(function (data, status, header, config) {
console.log(data)
$scope.fetchStatus = 'Ready!'
})
}
}])
Contents of ./public/backups/backupcontrollers.spec.js:
describe('BackupFormController', function () {
let $controller, $rootScope, $httpBackend
beforeEach(module('backup'))
const mockBackupString = 'string of backup data'
const mockBackupData = {
body: mockBackupString
}
const mockItemsUnsorted = [
{
metadata: {
name: 'prod-mock-1',
spec: 'asdf',
status: 'ok'
},
notes: []
},
{
metadata: {
name: 'dev-mock-1',
spec: 'asdf',
status: 'ok'
},
notes: []
},
{
metadata: {
name: 'integ-mock-1',
spec: 'asdf',
status: 'ok'
},
notes: []
}
]
beforeEach(inject(function ($injector) {
$rootScope = $injector.get('$rootScope')
const $controller = $injector.get('$controller')
$httpBackend = $injector.get('$httpBackend')
const mockEnv = $httpBackend.when('GET', '/env')
.respond(mockItemsUnsorted)
const mockAPI = $httpBackend.when('GET', '/api')
.respond(mockBackupString)
const createController = function () {
return $controller('BackupFormController', { '$scope': $rootScope })
}
}))
describe('$scope.getBackup', function () {
beforeEach(function () {
spyOn(console, 'log')
})
it('should GET /api and set $scope.backups', function () {
controller = createController()
console.log('Dumping fetchStatus: ', $rootScope.fetchStatus)
$rootScope.init()
$httpBackend.flush()
expect($rootScope.backups).toEqual(mockBackupString)
expect(console.log).toHaveBeenCalled()
})
})
})
It seems like this new spec isn't working correctly at all; when I run npm test I see the normal successful tests from ./public/controllers.spec.js but also:
Chrome Headless 105.0.5195.125 (Mac OS 10.15.7) BackupFormController $scope.getBackup should GET /api and set $scope.backups FAILED
ReferenceError: createController is not defined
at UserContext.<anonymous> (backup/backupcontrollers.spec.js:51:7)
at <Jasmine>
This is the only output concerning ./public/backups/backupcontrollers.spec.js.
Has anybody run into this before? I found some posts regarding including angular-mocks, but as you can see in karma.conf.js, it's being included.

Error: "Missing initializer in const declaration" plaid api create link token

I'm trying to setup the plaid api in my node.js code and I need to be able to make a request for the create_link_token. The sample code from their docs is as follows:
const request: LinkTokenCreateRequest = {
user: {
client_user_id: 'user-id',
},
client_name: 'Plaid Test App',
products: ['auth', 'transactions'],
country_codes: ['US'],
language: 'en',
webhook: 'https://sample-web-hook.com',
redirect_uri: 'https://domainname.com/oauth-page.html',
account_filters: {
depository: {
account_subtypes: ['DepositoryAccountSubtype.Checking, DepositoryAccountSubtype.Savings'],
},
},
};
try {
const response = await plaidClient.linkTokenCreate(request);
const linkToken = response.data.link_token;
} catch (error) {
// handle error
}
my code is:
app.post('/api/create_link_token', async (req, res, next) => {
const request: LinkTokenCreateRequest = {
user: {
client_user_id: 'user-id',
},
client_name: 'Plaid Test App',
products: ['auth', 'transactions'],
country_codes: ['US'],
language: 'en',
webhook: 'https://sample-web-hook.com',
redirect_uri: 'https://domainname.com/oauth-page.html',
account_filters: {
depository: {
account_subtypes: ['DepositoryAccountSubtype.Checking, DepositoryAccountSubtype.Savings'],
},
},
};
try {
const response = await plaidClient.linkTokenCreate(request);
const linkToken = response.data.link_token;
} catch(e) {
handleError(e);
}
});
Right off the bat I get the error: 'LinkTokenCreateRequest' refers to a value, but is being used as a type here. Did you mean 'typeof LinkTokenCreateRequest'?ts(2749) as a red underline underneath LinkTokenCreateRequest. Side note I've never used TS before this, but I believe I have to use it on this project because some of their components require it. If I do as they suggest and change it to typeof LinkTokenCreateRequest = {... then the red underline error goes away, however upon starting the server I get the error:
const request: typeof LinkTokenCreateRequest = {
^^^^^^^
SyntaxError: Missing initializer in const declaration
I've very confused as to how I can make this work so any suggestions would be much appreciated.

Node puppeteer scraping YouTube and encountering redirected you too many times

I'm trying to scrape a YouTube playlists URL using Node / puppeteer. It was working, but now I'm getting ERR_TOO_MANY_REDIRECTS error. I can still access the page using chrome from my desktop.
I've tried using the chromium browser and chrome browsers. I've also tried using the puppeteer-extra stealth plugin and the random-useragent.
This is how my code stand at the moment:
const browser = await puppeteer.launch({
stealth: true,
headless: false // true,
executablePath: "C:\\Program Files (x86)\\Google\\Chrome\\Application\\chrome.exe",
args: [
'--disable-notifications', '--disable-features=site-per-process'
],
defaultViewport: null
});
const page = await browser.newPage()
await page.setUserAgent(random_useragent.getRandom());
await page.goto(<playlist-url, {
waitUntil: 'networkidle2',
timeout: 0
})
await page.waitForSelector('button[aria-label="Agree to the use of cookies and other data for the purposes described"')
It at the page.goto it bombs. And it happens even if I try going to https://www.youtube.com.
Any suggestions what I should try next. I tried a proxy server but couldn't get it to work. I suspect I need a proxy to actually route through.
If all you need is playlist IDs for a given channel, it's possible to query a feed at:
https://youtube.com/feeds/videos.xml?channel_id=<Channel ID>
To get IDs of videos you can query a feed at:
https://youtube.com/feeds/videos.xml?playlist_id=PLAYLIST_ID
You can get playlists (and Mixes) links from YouTube like in the code example below (also check full code the online IDE):
const puppeteer = require("puppeteer-extra");
const StealthPlugin = require("puppeteer-extra-plugin-stealth");
puppeteer.use(StealthPlugin());
const searchString = "java course";
const requestParams = {
baseURL: `https://www.youtube.com`,
encodedQuery: encodeURI(searchString), // what we want to search for in URI encoding
};
async function fillPlaylistsDataFromPage(page) {
const dataFromPage = await page.evaluate((requestParams) => {
const mixes = Array.from(document.querySelectorAll("#contents > ytd-radio-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
const playlists = Array.from(document.querySelectorAll("#contents > ytd-playlist-renderer")).map((el) => ({
title: el.querySelector("a > h3 > #video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("a#thumbnail")?.getAttribute("href")}`,
channel: {
name: el.querySelector("#channel-name a")?.textContent.trim(),
link: `${requestParams.baseURL}${el.querySelector("#channel-name a")?.getAttribute("href")}`,
},
videoCount: el.querySelector("yt-formatted-string.ytd-thumbnail-overlay-side-panel-renderer")?.textContent.trim(),
videos: Array.from(el.querySelectorAll("ytd-child-video-renderer a")).map((el) => ({
title: el.querySelector("#video-title")?.textContent.trim(),
link: `${requestParams.baseURL}${el.getAttribute("href")}`,
length: el.querySelector("#length")?.textContent.trim(),
})),
thumbnail: el.querySelector("a#thumbnail #img")?.getAttribute("src"),
}));
return [...mixes, ...playlists];
}, requestParams);
return dataFromPage;
}
async function getYoutubeSearchResults() {
const browser = await puppeteer.launch({
headless: false,
args: ["--no-sandbox", "--disable-setuid-sandbox"],
});
const page = await browser.newPage();
const URL = `${requestParams.baseURL}/results?search_query=${requestParams.encodedQuery}`;
await page.setDefaultNavigationTimeout(60000);
await page.goto(URL);
await page.waitForSelector("#contents > ytd-video-renderer");
const playlists = await fillPlaylistsDataFromPage(page);
await browser.close();
return playlists;
}
getYoutubeSearchResults().then(console.log);
📌Note: to get thumbnail you need to scroll playlist into view (using .scrollIntoView() method).
Output:
[
{
"title":"Java Complete Course | Placement Series",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"channel":{
"name":"Apna College",
"link":"https://www.youtube.com/c/ApnaCollegeOfficial"
},
"videoCount":"35",
"videos":[
{
"title":"Introduction to Java Language | Lecture 1 | Complete Placement Course",
"link":"https://www.youtube.com/watch?v=yRpLlJmRo2w&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"18:46"
},
{
"title":"Variables in Java | Input Output | Complete Placement Course | Lecture 2",
"link":"https://www.youtube.com/watch?v=LusTv0RlnSU&list=PLfqMhTWNBTe3LtFWcvwpqTkUSlB32kJop",
"length":"42:36"
}
],
"thumbnail":null
},
{
"title":"Java Tutorials For Beginners In Hindi",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"channel":{
"name":"CodeWithHarry",
"link":"https://www.youtube.com/c/CodeWithHarry"
},
"videoCount":"113",
"videos":[
{
"title":"Introduction to Java + Installing Java JDK and IntelliJ IDEA for Java",
"link":"https://www.youtube.com/watch?v=ntLJmHOJ0ME&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"19:00"
},
{
"title":"Basic Structure of a Java Program: Understanding our First Java Hello World Program",
"link":"https://www.youtube.com/watch?v=zIdg7hkqNE0&list=PLu0W_9lII9agS67Uits0UnJyrYiXhDS6q",
"length":"14:09"
}
],
"thumbnail":null
}
]
You can read more about scraping YouTube playlists from blog post Web scraping YouTube secondary search results with Nodejs.

Reduce period for web scraping job with puppeter node.js

I have made a job script for web scraping periodically a page and save some information in a MongoDB database. I have tried to get as much performance as i can, and for now i'm able to execute the script each 10s. However, i would like to reduce it even more, with a period between 1-10 seconds if possible. The problem is that when i reduce it, my code throws the following warning and some executions get stacked unresolved:
(node:9472) MaxListenersExceededWarning: Possible EventEmitter memory leak detected. 11 exit listeners added. Use emitter.setMaxListeners() to increase limit
Is there a way to improve the code?
const $ = require('cheerio');
const MarketModel = require('./models/marketModel');
const mongoose = require('mongoose');
const puppeteer = require('puppeteer');
var schedule = require('node-schedule');
const {
Cluster
} = require('puppeteer-cluster');
//Connection to DataBase:
mongoose.connect('mongodb://localhost:27017/Tradheo', {
useNewUrlParser: true
});
mongoose.connection.on('error', error => console.log(error));
mongoose.Promise = global.Promise;
getMarketData = async () => {
console.log("Web scraping to get market data...")
let markets = []
let marketSpain = {
country: 'Spain',
name: 'IBEX 35',
companies: []
}
let marketGermany = {
country: 'Germany',
name: 'DAX',
companies: []
}
const cluster = await Cluster.launch({
concurrency: Cluster.CONCURRENCY_PAGE,
maxConcurrency: 2,
});
await cluster.task(async ({
page,
data: url
}) => {
await page.goto({
waitUntil: 'domcontentloaded'
});
await page.setRequestInterception(true);
page.on('request', request => {
if (request.resourceType() === 'document') {
request.continue();
} else {
request.abort();
}
});
const html = await page.content();
if (url === 'https://uk.investing.com/equities/spain') {
console.log('Spain data page content loaded');
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketSpain.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketSpain);
} else {
console.log('Germany data page content loaded');
$("table[class='genTbl closedTbl crossRatesTbl elpTbl elp30'] > tbody > tr", html).each((i, elem) => {
marketGermany.companies.push({
name: $("td[class='bold left noWrap elp plusIconTd'] > a", html).eq(i).html(),
last: $("td", elem).eq(2).text(),
high: $("td", elem).eq(3).text(),
low: $("td", elem).eq(4).text(),
change: $("td", elem).eq(5).text(),
changePerCent: $("td", elem).eq(6).text(),
volume: $("td", elem).eq(7).text(),
time: $("td", elem).eq(8).text(),
purchase: false,
sale: false
});
});
markets.push(marketGermany);
}
if (markets.length === 2) {
MarketModel.create({
markets,
}, (err) => {
if (err) return handleError(err);
})
console.log("Done!")
}
});
cluster.queue(url1);
cluster.queue(url2);
await cluster.idle();
await cluster.close();
}
var j = schedule.scheduleJob('*/10 * 8-17 * * 1-5', function () {
const now = new Date();
//Checks that time is between 8:30 - 17:35 (schedule of the stock exchange)
if (now.getHours() >= 8 && !(now.getHours() == 8 && now.getMinutes() < 30) && now.getHours() <= 17 && !(now.getHours() == 17 && now.getMinutes() > 35)) {
getMarketData();
}
});
UPDATE: I have added some improvements like setting waitUntil property to 'domcontentloaded' and request interception to avoid waiting for images, and any kind of resources apart from html content, to be loaded. However, seems to be insufficient to achieve the goal.

Resources