Related
I need to save data and file as a new project to my Mongo. For this I am using formidable.
My POST method looks like this:
exports.create = async (req, res) => {
let form = new formidable.IncomingForm();
form.keepExtensions = true;
form.parse(req, (err, fields, files) => {
if (err) {
return res
.status(400)
.json({ errors: [{ msg: 'Image could not be uploaded' }] });
}
const {
title,
description,
photo,
tags,
git,
demo,
projectType,
} = fields;
//get links object
const projectFields = {};
projectFields.creator = req.user._id;
if (title) projectFields.title = title;
if (title) projectFields.description = description;
if (photo) projectFields.photo = photo;
if (projectType) projectFields.projectType = projectType;
if (tags) {
projectFields.tags = tags.split(',').map((tag) => tag.trim());
}
//get links object
projectFields.links = {};
if (git) projectFields.links.git = git;
if (demo) projectFields.links.demo = demo;
//1kb = 1000
//1mb = 1000000kb
//name 'photo' mus match client side. use photo
if (files.photo) {
if (files.photo.size > 1000000) {
return res.status(400).json({
errors: [{ msg: 'Image could not be uploaded. File to big.' }],
});
}
//this relates to data in schema product
project.photo.data = fs.readFileSync(files.photo.path);
project.photo.contentType = files.photo.type;
}
});
I want to use async/await so I am using try{}catch(err){} for my project.save(). I am initializing all my fields where I have also nested links. Unfortunately this is not working as I thought it will work. Right now my POST is returning 500. I am sitting on this and right now I am at the point that this can get a bit messy and not even close to any solution.
I am running into a sync/async issue with my program and am having a hard time implementing async into my middleware. I have tried a few times and broken it a bunch, what would be a fairly simple way to implement async await?
Here is the route
router.post("/testmedia", uploadFiles, testMedia);
Here is the middleware
const uploadFiles = (req,res,next)=> {
let b2fileIDs = [];
let savedPhotoLinks = {};
upload(req,res,function(err){
req.files.forEach(function(image){
var b2accountId;
var b2authToken;
var b2uploadURL;
//console.log(image);
b2.authorize()
.then(function(auth){
b2accountId = auth.accountId;
b2authToken = auth.authorizationToken;
}).then(function(ret){
b2.getUploadUrl(process.env.B2BUCKET_ID)
}).then(function(cb){
b2uploadURL = cb.uploadUrl;
b2authToken = cb.authorizationToken;
}).then(function(cb){
b2.uploadFile({
uploadUrl: b2uploadURL,
uploadAuthToken: b2authToken,
filename: 'test',
data: image.buffer
}).then(function(cb){
var uploadLink = {link: cb.fileId};
b2fileIDs.push(uploadLink);
}).then(function(){
successFn(response){
console.log(b2fileIDs);
//Save ID's to DB here
}
}).catch((error) => {
console.log("This is an ERROR " + error );
});
})
console.log("1");
console.log('2');
})
console.log('3');
console.log(b2fileIDs);
})
console.log('4');
next();
};
The problem is that when I upload multiple files the async nature moves on without capturing the b2fileIDs line.
I want to do web scraping of this site.
I have seen that the APIs are available but, as suggested by duraid in my previous question, it is not advisable to use them.
So I tried to use Node.js and Phantom.js with Phantom.
This is my code:
var phantom = require('phantom');
// object of methods
var methods = {};
var loadInProgress = false;
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';
methods.download = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
await page.on('onLoadStarted', function() {
loadInProgress = true;
console.log('Load started...');
});
await page.on('onLoadFinished', function() {
loadInProgress = false;
console.log('Load end');
});
const status = await page.open(url);
console.log('STATUS:', status);
const content = await page.property('content');
console.log('CONTENT:', content);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').value = 'crID%3a250'; // France
document.getElementById('timeID%3a79').value = 'timeID%3a79'; // 2015
document.getElementById('varID%3a2').value = 'varID%3a2'; // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').submit(); // submit button
});
var result = await page.evaluate(function() {
return document.querySelectorAll('html')[0].outerHTML;
});
console.log('RESULT:', result);
await instance.exit();
};
module.exports = methods;
(How can they select more countries and more years?)
I tried to select France as Country or Area, 2015 as a Year and medium as a Variants.
So crID%3a250 is id of element:
<input type="checkbox" id="crID%3a250" value="crID%3a250" name="France" />
<label for="crID%3a250">France</label><br />
timeID%3a79 is id of element:
<input type="checkbox" id="timeID%3a79" value="timeID%3a79" name="2015" />
<label for="timeID%3a79">2015</label><br />
varID%3a2 is id of element:
<input type="checkbox" id="varID%3a2" value="varID%3a2" name="Medium" />
<label for="varID%3a2">Medium</label><br />
And then ctl00_main_filters_anchorApplyBottom is id of button element:
<div class="All">
<img src="_Images/IconUpdateResults.png" alt="Update" width="11px" height="11px" title="Apply filters" /> Apply Filters
</div>
But what I got is the web page itself (in HTML), not the data that interest me.
So it's as if I had not selected any parameters. Why?
EDIT 1
After the advice of #Vaviloff I tried to change the code but without success.
My server-side language is Node.js.
Using Phantom I modified the code like this:
methods.download = async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.log('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.log(msg);
});
const status = await page.open(url);
console.log('\n\nSTATUS:', status);
// submit
await page.evaluate(function() {
var countries = {
'Albania': 'crID%3a8',
'Austria': 'crID%3a40',
'Belgium': 'crID%3a56',
'Bulgaria': 'crID%3a100',
'Croatia': 'crID%3a191',
'Cyprus': 'crID%3a196',
'Denmark': 'crID%3a208',
'Estonia': 'crID%3a233',
'Finland': 'crID%3a246',
'France': 'crID%3a250',
'Germany': 'crID%3a276',
'Greece': 'crID%3a300',
'Iceland': 'crID%3a352',
'Ireland': 'crID%3a372',
'Italy': 'crID%3a380',
'Latvia': 'crID%3a428',
'Netherlands': 'crID%3a528',
'Norway': 'crID%3a578',
'Poland': 'crID%3a616',
'Portugal': 'crID%3a620',
'Romania': 'crID%3a642',
'Slovakia': 'crID%3a703',
'Slovenia': 'crID%3a705',
'Spain': 'crID%3a724',
'Sweden': 'crID%3a752',
'Switzerland': 'crID%3a756',
'United Kingdom': 'crID%3a826'
};
// 2018 - 1980
var years = ['timeID%3a83', 'timeID%3a82', 'timeID%3a81', 'timeID%3a79', 'timeID%3a78', 'timeID%3a77', 'timeID%3a76', 'timeID%3a75', 'timeID%3a73', 'timeID%3a72', 'timeID%3a71', 'timeID%3a70', 'timeID%3a69', 'timeID%3a67', 'timeID%3a66', 'timeID%3a65', 'timeID%3a64', 'timeID%3a63', 'timeID%3a61', 'timeID%3a60', 'timeID%3a59', 'timeID%3a58', 'timeID%3a57', 'timeID%3a55', 'timeID%3a54', 'timeID%3a53', 'timeID%3a52', 'timeID%3a51', 'timeID%3a49', 'timeID%3a48', 'timeID%3a47', 'timeID%3a46', 'timeID%3a45', 'timeID%3a43', 'timeID%3a42', 'timeID%3a41', 'timeID%3a40', 'timeID%3a39', 'timeID%3a37'];
// select countries
for(var c in countries) {
document.getElementById(countries[c]).setAttribute('checked', true);
}
// select years
for(var y in years) {
document.getElementById(years[y]).setAttribute('checked', true);
}
// select variants
document.getElementById('varID%3a2').setAttribute('checked', true); // medium
// click button
document.getElementById('ctl00_main_filters_anchorApplyBottom').click();
});
console.log('\nWaiting 1.5 seconds...');
await timeout(1500);
// get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
console.log('\n\nRESULT:', result);
elaborateResult(result);
await instance.exit();
};
function elaborateResult(res) {
var el = document.createElement('html'); // ** ERROR HERE **
el.innerHTML = result;
console.log('\n\nTD ELEMENTS:', el.getElementsByTagName('td'));
//var obj = utilFunc.createJsonObjectPop(year, country, population);
//console.log(obj);
}
There are two errors:
result contains only the values that are on the first page of the results, but with the selections made you get 22 pages of results and I don't understand how I can get all the values that interest me and link them in the variable result.
assuming to have solved the problem in point (1), now I should elaborate the results obtained and create an object like this:
var date = [{year: 2018, country: 'Albania', population: 2934.363}, {year: 2017, country: 'Albania', population: 2930.187}, ..., {year: 1980, country: 'United Kingdom ', population: 56265.475}]
This is what the elaborateResult(res) function should do (of course, the function is not complete, I have to finish it but I get an error at the first line), but I get the error:
ReferenceError: document is not defined
So I changed my strategy and I tried not to use Phantom but a normal request:
var options = {
uri: 'http://data.un.org/Handlers/DataHandler.ashx?Service=query&Anchor=variableID%3a12&Applied=crID%3a8&crID%3a40;timeID%3a79&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=302',
transform: function(body) {
return cheerio.load(body);
}
};
methods.download = async function(req, res) {
request(options)
.then(function($) {
console.log('\n\nTHEN: ', $);
})
.catch(function(err) {
console.log('Error', err.stack());
});
}
If I run this code I get:
THEN: function (selector, context, r, opts) {
if (!(this instanceof initialize)) {
return new initialize(selector, context, r, opts);
}
opts = _.defaults(opts || {}, options);
return Cheerio.call(this, selector, context, r || root, opts);
}
In this case I have other problems.
I don't know how to build the url.
In the example above I chose Albania (crID% 3a8) and Austria (crID% 3a40) and 2015 as year (timeID% 3a79).
Yet if I go to the link just built, I get as a result the data on Albania from 2100 to 2095.
I don't know how to select the years or how to select variants or how to change pages.
I feel a bit stupid but I can't get what I want... I'm stuck.
Help would be very welcome!
There are several issues with your script that prevent successful scrape.
To check a checkbox, you don't set its value again (it's already set in HTML!), you set its checked attribute to true:
document.getElementById('crID%3a250').setAttribute("checked", true); // France
The button that submits the form is a hyperlink <a> which doesn't have a submit method, it should be clicked (it even has onClick function in the code)
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // submit the form
**The search request ** is sent through ajax and takes time to complete, so your script should wait for at least a second vefore trying to fetch the data. I'll show how to wait in the full working code below.
Next, you may get only the table data, no need to sip through all th HTML:
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
Here's a bit trimmed down version of you script with issues corrected:
var phantom = require('phantom');
var url = 'http://data.un.org/Data.aspx?q=population&d=PopDiv&f=variableID%3A12';
// A promise to wait for n of milliseconds
const timeout = ms => new Promise(resolve => setTimeout(resolve, ms));
(async function(req, res) {
const instance = await phantom.create();
const page = await instance.createPage();
await page.on('onResourceRequested', function(requestData) {
console.info('Requesting', requestData.url);
});
await page.on('onConsoleMessage', function(msg) {
console.info(msg);
});
const status = await page.open(url);
await console.log('STATUS:', status);
// submit
await page.evaluate(function() {
document.getElementById('crID%3a250').setAttribute("checked", true); // France
document.getElementById('timeID%3a79').setAttribute("checked", true); // 2015
document.getElementById('varID%3a2').setAttribute("checked", true); // Medium
document.getElementById('ctl00_main_filters_anchorApplyBottom').click(); // click submit button
});
console.log('Waiting 1.5 seconds..');
await timeout(1500);
// Get only the table contents
var result = await page.evaluate(function() {
return document.querySelectorAll('.DataContainer table')[0].outerHTML;
});
await console.log('RESULT:', result);
await instance.exit();
})();
The last but not the least observation is that you could simply try to replay an ajax request made by the form and find out that the URL of search request works quite well on its own, when just opened in another tab:
You don't even need a headless browser to get it, just cUrl/requests and process. It happens with sites a lot, so it's useful to check network tab in your browser devtools before scraping.
Update
And if there are so many results that they are scattered over several pages, there is one more parameter to be used in request: Page:
data.un.org/Handlers/DataHandler.ashx?Service=page&Page=3&DataFilter=variableID:12&DataMartId=PopDiv&UserQuery=population&c=2,4,6,7&s=_crEngNameOrderBy:asc,_timeEngNameOrderBy:desc,_varEngNameOrderBy:asc&RequestId=461
The problem with current headless Chrome is that there is no API to render the full page you only get the "window" that you set in CLI parameter.
I am using the chrome-remote-interface module, this is the capture example:
const fs = require('fs');
const CDP = require('chrome-remote-interface');
CDP({ port: 9222 }, client => {
// extract domains
const {Network, Page} = client;
Page.loadEventFired(() => {
const startTime = Date.now();
setTimeout(() => {
Page.captureScreenshot()
.then(v => {
let filename = `screenshot-${Date.now()}`;
fs.writeFileSync(filename + '.png', v.data, 'base64');
console.log(`Image saved as ${filename}.png`);
let imageEnd = Date.now();
console.log('image success in: ' + (+imageEnd - +startTime) + "ms");
client.close();
});
}, 5e3);
});
// enable events then start!
Promise.all([
// Network.enable(),
Page.enable()
]).then(() => {
return Page.navigate({url: 'https://google.com'});
}).catch((err) => {
console.error(`ERROR: ${err.message}`);
client.close();
});
}).on('error', (err) => {
console.error('Cannot connect to remote endpoint:', err);
});
To render the full page, one slower and hack solution would be partial rendering. Set fixed height and scroll through the page and take the screenshots after every X pixels. The problem is that how to drive the scrolling part? Would it be better to inject custom JS or is it doable through the Chrome remote interface?
Have you seen this?
https://medium.com/#dschnr/using-headless-chrome-as-an-automated-screenshot-tool-4b07dffba79a
This bit sound's like it would solve your issue:
// Wait for page load event to take screenshot
Page.loadEventFired(async () => {
// If the `full` CLI option was passed, we need to measure the height of
// the rendered page and use Emulation.setVisibleSize
if (fullPage) {
const {root: {nodeId: documentNodeId}} = await DOM.getDocument();
const {nodeId: bodyNodeId} = await DOM.querySelector({
selector: 'body',
nodeId: documentNodeId,
});
const {model: {height}} = await DOM.getBoxModel({nodeId: bodyNodeId});
await Emulation.setVisibleSize({width: viewportWidth, height: height});
// This forceViewport call ensures that content outside the viewport is
// rendered, otherwise it shows up as grey. Possibly a bug?
await Emulation.forceViewport({x: 0, y: 0, scale: 1});
}
setTimeout(async function() {
const screenshot = await Page.captureScreenshot({format});
const buffer = new Buffer(screenshot.data, 'base64');
file.writeFile('output.png', buffer, 'base64', function(err) {
if (err) {
console.error(err);
} else {
console.log('Screenshot saved');
}
client.close();
});
}, delay);
});
Chrome remote interface supports simulating scrolling gestures using the Input domain.
// scroll down y axis 9000px
Input.synthesizeScrollGesture({x: 500, y: 500, yDistance: -9000});
more info:
https://chromedevtools.github.io/devtools-protocol/tot/Input/
You may also be interested in the Emulation domain. dpd's answer contains a few now removed methods. I believe Emulation.setVisibleSize might work for you.
https://chromedevtools.github.io/devtools-protocol/tot/Emulation/
I am setting up a server with Node and Express for the first time and am having trouble saving the response I am retrieving in my PUT call. This is a survey - I need to update the model with the "responded" object entered in the survey.
I do see the correct response outputting in the console but receive "Object [object Object],[object Object],[object Object],[object Object],[object Object] has no method 'findById'" from my "save" function.
Thank you in advance.
kitty-questions.json
[
{
"id": "favorite-food",
"number": "1",
"url": "favorite-food",
"name": "Favorite Food",
"question": "Which of the following best describes your kitty's palatte?",
"responded" : "default response",
"query": "Which of the following best describes your kitty's palatte?",
"answers": {
"Grumpy" : "Fresh Water Salmon, no bones, served on china",
"Hipster" : "Nothing - trying to fit into newer, tighter jeans",
"Pudge" : "Anything and everything my owner is eating",
"Bub" : "Mice",
"Meow" : "Roaches"
}
},
{
"id": "favorite-band",
"number": "2",
"url": "favorite-band",
"name": "Favorite Band",
"question": "Your kitty claws at you desperatly when it wants to listen to:",
"responded" : "default response",
"query": "Which of the following best describes your kitty's palatte?",
"answers": {
"Bub" : "Country",
"Grumpy" : "Mozart. Popular music is for the plebs.",
"Pudge" : "z100",
"Meow" : "Very heavy metal",
"Hipster" : "something long winded"
}
}
Server.js
var express = require('express'),
http = require('http'),
questions = require('./data/kitty-questions');
var app = express()
.use(express.bodyParser())
.use(express.static('public'));
app.get('/questions', function (req, res) {
res.json(questions);
});
app.post('/questions', function (req, res) {
var matches = questions.filter(function (question) {
return question.url === req.body.url;
});
if (matches.length > 0) {
res.json(409, {status: 'question already exists'});
} else {
req.body.id = req.body.url;
questions.push(req.body);
res.json(req.body);
}
});
app.put('/questions/:question_name', function (req, res) {
var matches = questions.filter(function (question) {
return question.url === req.params.question_name;
});
var catResponse = req.body.responded;
console.log(JSON.stringify(catResponse));
return questions.findById(req.params.question_name, function (err, question) {
question.catResponse = req.body.responded;
return question.save(function (err) {
if (!err) {
console.log("updated");
} else {
console.log(err);
}
return res.send(question);
});
});
});
app.get('/questions/:question_name', function (req, res) {
var matches = questions.filter(function (question) {
return question.url === req.params.question_name;
});
if (matches.length > 0) {
res.json(matches[0]);
} else {
res.json(404, {status: 'invalid survey question'});
}
});
app.delete('/questions/:question_name', function (req, res) {
var found = false;
items.forEach(function (question, index) {
if (question.url === req.params.question_name) {
found = index;
}
});
if (found) {
items.splice(found, 1);
res.json(200, {status: 'deleted'});
} else {
res.json(404, {status: 'invalid survey question deletion'});
}
});
app.get('/*', function (req, res) {
res.json(404, {status: 'not found'});
});
http.createServer(app).listen(3000, function () {
console.log("Server ready at http://localhost:3000");
});
STRING FROM THE TERMINAL AFTER MAKING PUT CALL:
Server ready at http://localhost:3000
TypeError: Object [{"id":"favorite-food","number":"1","url":"favorite-food","name":"Favorite Food","question":"Which of the following best describes your kitty's palatte?","responded":"default response","query":"Which of the following best describes your kitty's palatte?","answers":{"Grumpy":"Fresh Water Salmon, no bones, served on china","Hipster":"Nothing - trying to fit into newer, tighter jeans","Pudge":"Anything and everything my owner is eating","Bub":"Mice","Meow":"Roaches"}},{"id":"favorite-band","number":"2","url":"favorite-band","name":"Favorite Band","question":"Your kitty claws at you desperatly when it wants to listen to:","responded":"default response","query":"Which of the following best describes your kitty's palatte?","answers":{"Bub":"Country","Grumpy":"Mozart. Popular music is for the plebs.","Pudge":"z100","Meow":"Very heavy metal","Hipster":"something long winded"}},{"id":"favorite-hideout","number":"3","url":"favorite-hideout","name":"Favorite Hideout","question":"You are most likely to find your beast perched here:","responded":"","answers":{"Bub":"On your shoulder","Grumpy":"Alone. Anywhere, just alone.","Pudge":"In the fridge","Meow":"Herding other cats","Hipster":"Outside, smoking."}},{"id":"favorite-friends","number":"4","url":"favorite-friends","name":"Favorite Friends","question":"Your kitty generally gets along with:","responded":"","answers":{"Bub":"Other cats","Grumpy":"No one.","Pudge":"Humans, animals, whoever.","Meow":"Obedient animals","Hipster":"dogs"}},{"id":"favorite-celebrity","number":"5","url":"favorite-celebrity","name":"Favorite Celebrity","question":"Your feline cannot get enough of this red carpet walker:","responded":"","answers":{"Bub":"Meg Ryan","Grumpy":"Jack Nicholson","Pudge":"John Candy","Meow":"Does General McArthur count?","Hipster":"Zooey Deschanel"}}] has no method 'update'
3/19 UPDATE:
app.put('/questions/:question_name', function (req, res) {
var question = questions.filter(function (question) {
return question.url === req.params.question_name;
});
var defaultResponse = question[0].responded;
res.json(defaultResponse);
var catResponse = req.body.responded;
questions.update({id: req.params.question_name}, function (err, question) {
question.catResponse = catResponse;
question.save(function (err) {
if (!err) {
res.send(catResponse);
} else {
res.send(400); //or something
}
});
});
});
There are a lot of unnecessary returns going on here, and at the very least, they make the code confusing to read.
Removing some stuff and ignoring the matches variable, since that isn't used in the PUT itself, something like this may be more what you're looking for:
app.put('/questions/:question_name', function (req, res) {
var catResponse = req.body.responded;
questions.update({id: req.params.question_name}, function (err, question) {
question.catResponse = catResponse;
question.save(function (err) {
if (!err) {
res.send(question);
} else {
res.send(400); //or something
}
});
});
});
*EDIT*
I assumed that questions = require('./data/kitty-questions'); was your mongoose model. You need questions to be a mongoose model for update to work.
Like:
var mongoose = require('mongoose')
, Questions = mongoose.model('Question')
Then your questions model file probably looks like:
var mongoose = require('mongoose');
var Schema = mongoose.Schema;
var ObjectId = Schema.ObjectId;
QuestionSchema = new Schema({
//keys and stuff in here
});
mongoose.model('Question', QuestionSchema);