I am trying to scrape data from site by spookyjs and store in mongoDB.I am able to get data from the website.But not able to save scraped data from spookyjs environment to mongoDB.To save scraped data,I passed my database model instance to spookyjs .I refered below link for it.
https://github.com/SpookyJS/SpookyJS/wiki/Introduction
Below is my code where I extracted data in prod_link_info variable and pass its values into mongoDB
var product_model = require('./product').product_model;
//get results
spooky.then([{product_model:product_model},function(){
this.waitForSelector('li[id^="product_"]', function() {
// Get info on all elements matching this CSS selector
var prod_link_info = this.evaluate(function() {
var nodes = document.querySelectorAll('li[id^="product_"]');
return [].map.call(nodes, function(node) { // Alternatively: return Array.prototype.map.call(...
return node.querySelector('a').getAttribute('href')+"\n";
});
});
//insert values in mongodb
for (var i = 0; i < prod_link_info.length; i++) {
product_model.create(
{
prod_link_info:prod_link_info[i],
}, function(err, product){
if(err) console.log(err);
else console.log(product);
});
} });
}]);
Below is the code of database schema and model used in above code.
var mongoose=require('mongoose');
var Schema = mongoose.Schema;
// create a schema
var productSchema = new Schema({
prod_link_info: String,
});
var product_model= mongoose.model('product_model', productSchema);
module.exports = {
product_model: product_model
}
But when I run above code it gives me following error ReferenceError: Can't find variable: product_model.
I want to store the data extracted from spookyjs to mongoDB.Please suggest where am I doing wrong.
When you pass hash of variables to spooky, it is converted to a string using JSON.stringify and then gets converted back to an object using JSON.parse in casper environment (please refer docs); so it is impossible to pass mongoose model to casper environment (moreover there is no actual reason for that).
To solve the problem, you should pass the data from Spooky (casper) environment. As far as I know, the only way to do is to emit data and then handle it using spooky.on. Your example should look like:
var product_model = require('./product').product_model;
//get results
spooky.then([{},function(){
this.waitForSelector('li[id^="product_"]', function() {
// Get info on all elements matching this CSS selector
var prod_link_info = this.evaluate(function() {
var nodes = document.querySelectorAll('li[id^="product_"]');
return [].map.call(nodes, function(node) { // Alternatively: return Array.prototype.map.call(...
return node.querySelector('a').getAttribute('href')+"\n";
});
});
this.emit('data.ready', prod_link_info);
});
}]);
spooky.on('data.ready', function (prod_link_info) {
//insert values in mongodb
for (var i = 0; i < prod_link_info.length; i++) {
product_model.create(
{
prod_link_info:prod_link_info[i],
}, function(err, product){
if(err) console.log(err);
else console.log(product);
});
}
});
Related
I'm new to node.js and currently working on a project using keystonejs cms and MongoDB. Now I'm stuck in getting data related to multiple collections. Because of this callback functions, I couldn't return an array with relational data. My code something similar to this sample code.
var getAgenda = function(id, callback){
callback = callback || function(){};
if(id){
AgendaDay.model.find({summit:id}).exec(function (err, results3) {
var arr_agenda = [];
var arr_agenda_item = [];
for(var key3 in results3){
AgendaItem.model.find({agendaDay:results3[key3]._id}).exec(function (err, results2){
for(var key2 in results2){
arr_agenda_item.push(
{
item_id: results2[key2]._id,
item_name: results2[key2].name,
from_time: results2[key2].time_from,
to_time: results2[key2].time_to,
desc: results2[key2].description,
fatured: results2[key2].featured,
}
);
}
arr_agenda.push(
{
name: results3[key3].name,
date: results3[key3].date,
description: results3[key3].description,
item_list:arr_agenda_item
}
);
return callback(arr_agenda);
});
}
});
}
}
exports.list = function (req, res) {
var mainarray = [];
Summit.model.find().exec(function (err, resultssummit) {
if (err) return res.json({ err: err });
if (!resultssummit) return res.json('not found');
Guest.model.find().exec(function (err, resultsguset) {
for(var key in resultssummit){
var agen_arr = [];
for(var i=0; i<resultssummit[key].guests.length; i++){
var sumid = resultssummit[key]._id;
//this is the function im trying get data and assign to mainarray
getAgenda(sumid, function(arr_agenda){
agen_arr = arr_agenda;
});
mainarray.push(
{
id: resultssummit[key]._id,
name: resultssummit[key].name,
agenda_data: agen_arr,
}
);
}
res.json({
summit: mainarray,
});
}
});
}
}
If anyone can help me out, that would be really great :)
You need to restructure this whole thing. You should not be calling mongo queries in a for loop and expecting their output at the end of the loop. Also, your response is in a for loop. That won't work.
I'll tell you how to do it. I cannot refactor all of that code for you.
Instead of putting mongodb queries in a for loop, you need to convert it in a single query. Just put the _ids in a single array and fire a single query.
AgendaItem.model.find({agendaDay:{$in:ARRAY_OF_IDS}})
You need to do the same thing for AgendaDay.model.find({summit:id}) as well.
I am looking to scrape data from specific URLs and updating an existing data in MongoDB. I am also using CRON (not sure if that has any affect on the code). Below is the code:
module.exports = new CronJob('0 */59 * * * *', function() {
nightmare
.goto('http://www.xyzxyzxyz.com/')
.wait(2000)
.evaluate(function() {
var itemS = [];
var itemList = document.querySelectorAll('.item-list');
for(var i = 0; i < item-list.length; i++) {
var item = {};
item['name'] = itemList[i].querySelector('.item-title').innerHTML;
item['image'] = itemList[i].querySelector('.item-image').src;
itemS.push(item);
}
return itemS;
})
//.end() -- removed because CRON does not restart
.then(function(result) {
var id = mongoose.Types.ObjectId("589b8d8fe860591071d4e2c6");
mongooseModel.findByIdAndUpdate(id, {$set: {key: result}}, function(err, res) {
if(err) {
console.log(err);
} else {
console.log('success!');
}
});
console.log(result);
})
.catch(function(error) {
console.error('Search failed:', error);
});
}, null, true);
the results are console.log(ged) (console.log(result)) but the console.log("success!") callback does not execute.
Also, unrelated, when I pull images from some sites, I'm getting Data URLs instead of the img url. I tried putting the data URL directly into a tag, but the image shows up blank -- I also tried decoding the image, using online resources, but the image still does not render. any possible solutions for this?
Thanks so much, in advance!
I'm trying to get data from MongoDB collection and then save it to a global object.Later I need to parse it to HTML template.
Here is my code:
When user log onto his profile: then we need to get his projects and here we call findeprojects() function
usrRouter.route('/profile')
.all(function (req,res,next) {
if(!req.user){
res.redirect('/');
}
next();
})
.get(function (req,res,userObj) {
// var proj = findprojects();
userObj = req.user;
var pro = {};
pro = findprojects(userObj);
res.render('index',{name:userObj.username, email:userObj.email});
//res.sendFile('profile.html',{root:path.join(__dirname,'../public'),},{name:userObj.username});
});
Here is findeprojects function code:
var findprojects = function(obj) {
var usern = obj.username;
mongodb.connect(url,function(err, db){
if(err) throw err;
var collection = db.collection('projects');
//console.log(usern);
collection.find({'pusername':usern});
cursor =db.collection('projects').find({ 'pusername': usern }).toArray(function(err,items){
//console.log(items);
var i;
for(i=0; i<items.length;){
userProjects.createdBy = items[i].pusername;
userProjects.proName = items[i].projectName;
userProjects.proType = items[i].projectType;
userProjects.proDesc = items[i].projectDesc;
//return userProjects;
i = i+1;
}
});
console.log(userProjects);
});
};
I have declared global object at the top like:
userProjects = {
createdBy:'',
proName:'',
proType:'',
proDesc:''
};
But when I console userprojects object after calling the findeprojects() function it displays empty values.
why dont you use mongoose to model your stuff.
its more intuitive and you no need to declare the global object and do the mapping in the for loop that you are doing.
also your approach is a bit wrong in terms of when you iterate through for aren't you overwriting ?
say you have two documents where pusername is abdul.
so in your case you loose first object which will get overwritten by the second one.
i see that you commented out a return statement but even that wont work properly.
from a design point of view your approach is not efficient.
in mongoose you can do:
{
var userProjectSchema = new mongoose.Schema({
createdBy: { type: String }
, proName: String
, proType: String
, proDesc: String
});
// Find a single document by username.
userProjectSchema.findOne({ pusername : 'abdul' }, function(err, resDoc) {
if (err) return console.error(err);
// do your html stuff here
});
// Find all documents.
userProjectSchema.find(function(err, results) {
if (err) return console.error(err);
// do your html stuff here
});
}
Through ajax request I have get the data form client and save it in mongodb database (mongoose) through save query .Now I want to know how to find data and display it new page.
I have get the data from client and save it in database. Now I want that when callback function calls it find data from the database and display it in new page using response.redirect.Please guide me.
$("#saveChanges5").live('click',function(){
var sendingObj = {
regOperation: $("#area_operation").val()
,regFieldAct: $("#field_activity").val()
,regOther: $("#other_details").val()
};
$.ajax({
url:'/WorkDetails'
,type:'post'
,data: sendingObj
,success:function(){
alert('Successfully saved')
},
error: function(){
alert("Saving Failed")
}
})
});
app.post("/WorkDetails",function(req ,res){
console.log(req.body);
saveWorkDetails(req.body.regOperation ,req.body.regFieldAct ,req.body.regOther ,function(){
res.send("");//here i want to add new code
});
});
function saveWorkDetails(area_operation ,field_activity ,other_details , callback){
console.log("save CALLED");
var receivedObj = new WorkDetailInfo({
area_operation:area_operation ,
field_activity:field_activity ,
other_details:other_details
});
console.log(receivedObj);
receivedObj.save(function(err){
console.log("inside Save ");
if(err){
//res.send(err);
console.log(err);
}
else{
callback();
}
});
}
You can make use of narwhal-mongodb APIs
Following is the example usage:
var MongoDB = require("mongodb");
var db = new MongoDB.Mongo().getDB("mydb");
var colls = db.getCollectionNames();
colls.forEach(function(el) { print(el); });
var coll = db.getCollection("testCollection");
In a node.js / Mongoose project, I have a schema which contains references to external image files.
var PageSchema = new Schema({
title: String
, media: {
digest: String
, name: String
}
});
Those files have additional properties which are stored in the file itself: url, width, height, exif fields, etc. Those fields will need to be populated before the model being sent to res.render().
For some fields, things are synchronous and a virtual just does the job:
PageSchema.virtual('media.url').get(function () {
return appPaths.fileUrl(this.media);
});
However, width / height, or exif fields require async calls. I thought of using middleware to populate them, but this does not seem to work:
PageSchema.post('init', function(next) {
var media = this.media;
var fileName = filedb.absoluteFilePath(media);
im.identify(fileName, function(err, features) {
if (err) {
media.width = 0;
media.height = 0;
} else {
media.width = features.width;
media.height = features.height;
}
next();
});
});
What am I doing wrong? Is there a common design pattern for solving this kind of problem? (Other than duplicating this information in the database itself?)
The real problem here is that mongoose currently seems to have a wonky implementation of post callbacks. While pre('init',function(next){ ... }); works as you expect, post('init',function(next){ ... }); does not actually get passed a next function. In fact, the post init callback does not receive any arguments whatsoever when it is called.
As such, I usually write a wrapper for my query callbacks to make a sort of DIY middleware:
var setAsyncVirtuals = function(callback){
return function(err, docs){
if(err) return callback(err);
var i = done = docs.length;
if(i > 0)
while(i--){
(function(i){
var filename = getFilename();
im.identify(filename, function(err, features) {
if (err) {
docs[i].media.width = 0;
docs[i].media.height = 0;
} else {
docs[i].media.width = features.width;
docs[i].media.height = features.height;
}
done--;
if(done <= 0) callback(null, docs);
});
})(i); // bind i to hold value for async call
}
else callback(null, docs);
}
}
then
Page.find({}, setAsyncVirtuals(function(err,docs){
res.send(docs); // these have media.width & media.height assigned
}));