I am trying to make a web crawler which crawls IMDB and lists the movie name and rating. This is my index.js file.
Suppose i am crawling for 10 movies. I am then saving the crawled results in a different file say 'message.txt'. Now i want to send this message.txt file as a response to any request. But whenever I make a request it always send me an empty file to my browser initially. Then i notice that it takes some time before the crawled results are saved in the message.txt file. I think this is because all actions are asynchronous in nodejs. So is there a way to send the message.txt file only after crawling is complete?
var express = require('express');
var app = express();
var cheerio = require('cheerio');
var request = require('request');
var fs = require('fs');
app.listen(8080);
console.log('Running');
app.get('/', function(req, res) {
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
request(url, function(error, response, html) {
if (!error) {
var $ = cheerio.load(html);
var title, ratings, released;
var json = {
title: '',
ratings: '',
released: ''
};
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
console.log(json);
fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {});
};
});
};
res.sendFile(__dirname + '/index.js');
});
You can use the async package which is great for controlling flow, something like:
console.log('Recieved the get Request');
var i = 1;
var count = 0;
while (count < 10) {
var url = 'http://www.imdb.com/title/tt' + i + '/';
console.log(url);
count = count + 1;
i = i + 1;
async.waterfall([
function sendRequest (callback) {
if (!error) {
var $ = cheero.load(html);
var json = {
title: '',
ratings: '',
released: ''
}
}
$('.title_wrapper').filter(function() {
var data = $(this);
json.title = data.children().first().text().trim();
json.released = data.children().last().children().last().text().trim();
});
$('.ratingValue').filter(function() {
var data = $(this);
json.ratings = parseFloat(data.text().trim());
});
callback(null, JSON.stringify(json, null, 4) + '\n');
},
function appendFile (json, callback) {
fs.appendFile('message.txt', json, function(err) {
if (err) { callback(err); }
callback();
});
}
], function(err) {
res.sendFile(__dirname + '/index.js');
});
fs.appendFile('message.txt', JSON.stringify(json, null, 4) + '\n', function(err) {
//This part is executed after the process has been completed
});
You have to make a callback there as that part will be only called when your operation has been performed.
We are utilizing the callback feature here although there isn't any concrete callback except the err in our case, we don't need any other badly though.
Please try.
fs.appendFile() is asynchronous so the stuff you append to the file won't be there right away when the function returns. So if you want to read send that file to the user, you'll need to do it inside the callback you supply to fs.appendFile().
app.get('/', function(req, res) {
...
fs.appendFile(
'message.txt',
JSON.stringify(json, null, 4) + '\n',
function(err) {
if (err) {
// Log the error and send a message to the user here
return;
}
res.sendFile(__dirname + '/index.js')
}
);
};
});
};
});
You may be tempted to use fs.appendFileSync() instead. That would be fine for a command line tool, but since this is a web server, do not do that. It will lock up the thread while the I/O happens.
Related
I looked at other questions regarding this topic but can't wrap my head around how to implement it in this case.
What I am trying to achieve:
Visit site and get content (body)
Visit matching test site and get content (body)
Compare content
Crawl links on page1
Crawl links on page2
Continue
The problem I am having at the moment is that I cannot compare the content because the requests are not waiting for each other.
Here's what my code looks like at the moment.
require('colors');
var request = require('request');
var cheerio = require('cheerio');
var jsdiff = require('diff');
var URL = require('url-parse');
var PROD_START_URL = "https://www.somesite.org";
var MAX_PAGES_TO_VISIT = 100;
var pagesVisited = {};
var numPagesVisited = 0;
var pagesToVisit = [];
var globalProdContent;
var globalTestContent;
var url = new URL(PROD_START_URL);
var baseUrl = url.protocol + "//" + url.hostname;
pagesToVisit.push(PROD_START_URL);
crawl();
function crawl() {
if(numPagesVisited >= MAX_PAGES_TO_VISIT) {
console.log("Reached max limit of number of pages to visit.");
return;
}
var nextPage = pagesToVisit.pop();
if (nextPage in pagesVisited) {
// We've already visited this page, so repeat the crawl
crawl();
} else {
// New page we haven't visited
visitPage(nextPage, crawl);
}
}
function visitPage(url, callback) {
// Add page to our set
pagesVisited[url] = true;
numPagesVisited++;
// Make the request
console.log("Visiting page " + url);
request(url, function(error, response, body) {
// Check status code (200 is HTTP OK)
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
// Parse the document body
var $ = cheerio.load(body);
globalProdContent = $("#wrapper").text();
// Build new URL for test site
var testURL = url.replace("https://www.somesite.org", "http://matching.testsite");
// Scrape test site
scrapeTestContent(testURL);
collectInternalLinks($);
callback();
});
}
function collectInternalLinks($) {
var relativeLinks = [];
relativeLinks = $("a[href]");
console.log("Found " + relativeLinks.length + " relative links on page");
relativeLinks.each(function() {
pagesToVisit.push(baseUrl + "/" + $(this).attr('href'));
});
}
function scrapeTestContent(testURL) {
console.log("Visiting matching testpage " + testURL);
request(testURL, function(error, response, body) {
console.log("Status code: " + response.statusCode);
if(response.statusCode !== 200) {
callback();
return;
}
var $ = cheerio.load(body);
globalTestContent = $("#wrapper").text();
console.log(globalTestContent);
});
}
Is there an easier way to do this or am I completely off the track?
This can be done in two ways:
1. Add callback to scrapeTestContent
function scrapeTestContent(testURL, cb) {
...
request(testURL, function(error, response, body) {
cb();
});
In visitPage,
function visitPage(url, callback) {
...
scrapeTestContent(testURL, () => collectInternalLinks($));
}
Using ES6 promises. In scrapeTestContent() return new Promise((resolve, reject) => {}. Then in visitPage, use following construct: scrapeTestContent(testUrl).then(() => collectInternalLinks($))
I am using Mongodb to store video files as grid fs. It surprised me today when I came to know that video is not playing on Safari browser. However video read from Gridfs is playing fine on Chrome & Firefox. Following are two approach to read video files back from Grid fs. Both approach has same problem. I do the that correct mime type is getting set.
Approach 1:
exports.previewFile = function (req, res) {
var contentId = new DBModule.BSON.ObjectID(req.params.fileid);
log.debug('Calling previewFile inside FileUploadService for content id ' + contentId);
//Read metadata details from fs.files
var query = {_id: contentId};
documentOperationModule.getDocumentByQuery(query, constants.FS_FILES_COLLECTION, function (err, files) {
if (!Utilities.isEmptyList(files)) {
var fileObj = files[0];
var gridStore = DBModule.db.gridStore(contentId, 'r');
gridStore.open(function (err, gridStore) {
var stream = gridStore.stream(true);
if (!Utilities.isEmptyObject(fileObj.metadata)) {
res.setHeader('Content-Type', fileObj.metadata.contentType);
}
stream.on("data", function (chunk) {
log.debug("Chunk of file data");
res.write(chunk);
});
stream.on("end", function () {
log.debug("EOF of file");
res.end();
});
stream.on("close", function () {
log.debug("Finished reading the file");
});
});
} else {
log.error({err: err}, 'Failed to read the content for id ' + contentId);
res.status(constants.HTTP_CODE_INTERNAL_SERVER_ERROR);
res.json({error: contentId + " not found"});
}
});
};
Approach 2:
exports.previewFile = function (req, res) {
var contentId = new DBModule.BSON.ObjectID(req.params.fileid);
log.debug('Calling previewFile inside FileUploadService for content id ' + contentId);
//Read metadata details from fs.files
var query = {_id: contentId};
documentOperationModule.getDocumentByQuery(query, constants.FS_FILES_COLLECTION, function (err, files) {
if (!Utilities.isEmptyList(files)) {
var fileObj = files[0];
var gridStore = DBModule.db.gridStore(contentId, 'r');
gridStore.read(function (err, data) {
if (!err) {
if (!Utilities.isEmptyObject(fileObj.metadata)) {
res.setHeader('Content-Type', fileObj.metadata.contentType);
}
res.end(data);
} else {
log.error({err: err}, 'Failed to read the content for id ' + contentId);
res.status(constants.HTTP_CODE_INTERNAL_SERVER_ERROR);
res.json({error: err});
}
});
} else {
log.error({err: err}, 'Failed to read the content for id ' + contentId);
res.status(constants.HTTP_CODE_INTERNAL_SERVER_ERROR);
res.json({error: contentId + " not found"});
}
});
};
Following is screen of Safari for reference.
Please help
Try this GIST (by https://gist.github.com/psi-4ward)
It makes use of the byte range header
https://gist.github.com/psi-4ward/7099001
Although it does not work for me with safari, it makes sure that the correct hears are set and the correct content is delivered. It could narrow down your problem
EDIT
I've updated the GIST. It works now fine with Safari for me
https://gist.github.com/derMani/218bd18cc926d85a57a1
This should solve your problem
function StreamGridFile(req, res, GridFile) {
if(req.headers['range']) {
// Range request, partialle stream the file
console.log('Range Reuqest');
var parts = req.headers['range'].replace(/bytes=/, "").split("-");
var partialstart = parts[0];
var partialend = parts[1];
var start = parseInt(partialstart, 10);
var end = partialend ? parseInt(partialend, 10) : GridFile.length -1;
var chunksize = (end-start)+1;
res.writeHead(206, {
'Content-disposition': 'filename=xyz',
'Accept-Ranges': 'bytes',
'Content-Type': GridFile.contentType,
'Content-Range': 'bytes ' + start + '-' + end + '/' + GridFile.length,
'Content-Length': chunksize
});
// Set filepointer
GridFile.seek(start, function() {
// get GridFile stream
var stream = GridFile.stream(true);
// write to response
stream.on('data', function(buff) {
// count data to abort streaming if range-end is reached
// perhaps theres a better way?
if(start >= end) {
// enough data send, abort
GridFile.close();
res.end();
} else {
res.write(buff);
}
});
});
} else {
// stream back whole file
console.log('No Range Request');
res.header('Content-Type', GridFile.contentType);
res.header('Content-Length', GridFile.length);
var stream = GridFile.stream(true);
stream.pipe(res);
}
}
Regards
Rolf
I'm working on a cache warming script in node. It's using both the Async and Request npm modules. The only real changes in the example code are variable names and values.
var languages = ['es', 'en'];
var baseUrl = 'http://localhost:8080/';
var presents = [1,2,3,4];
var foods = ['green eggs', 'roast beast', 'potatas'];
async.each(languages, function (lang, callback) {
async.each(whos, function (who, callback2) {
request.get(baseUrl + lang + '/' + who, function (err, res, body) {
var grinches = body;
async.each(grinches, function (grinch, callback3) {
async.each(presents, function (present, callback4) {
async.each(foods, function (food, callback5) {
var finalUrl = baseUrl + lang + '/comp/' + who + '/' + grinch + '/' + present + '/' + food;
console.log(finalUrl);
request.get(baseUrl + 'en/ac', function (error, response, body) {
console.log(error);
console.log(response);
console.log(body);
callback5();
});
}, function () {
callback4();
});
}, function () {
callback3();
});
});
}, function () {
callback2();
});
});
}, function () {
callback();
});
}, function () {
cb(null, 7);
});
In this example the finalUrl successfully logs but nothing within the immediately following request will log. If I grab one of the finalUrl's and drop it in a browser the request will process correctly. Is there some kind of stack limit within Async or Request that I'm possibly hitting?
If instead of making requests a that point, I push the urls into an array and then loop over that to make requests from the final async.each callback they will work. However, this solution has performance (memory) issues because I have to maintain massive fast-growing arrays. Here's an example of that solution
var languages = ['es', 'en'];
var baseUrl = 'http://localhost:8080/';
var presents = [1,2,3,4];
var foods = ['green eggs', 'roast beast', 'potatas'];
var urls = [];
async.each(languages, function (lang, callback) {
async.each(whos, function (who, callback2) {
request.get(baseUrl + lang + '/' + who, function (err, res, body) {
var grinches = body;
async.each(grinches, function (grinch, callback3) {
async.each(presents, function (present, callback4) {
async.each(foods, function (food, callback5) {
var finalUrl = baseUrl + lang + '/comp/' + who + '/' + grinch + '/' + present + '/' + food;
urls.push(finalUrl);
}, function () {
callback4();
});
}, function () {
callback3();
});
});
}, function () {
callback2();
});
});
}, function () {
callback();
});
}, function () {
for (var i = 0; i < urls.length; i++) {
request.get(urls[i], function (err, res, body) {
console.log('this works');
})
}
});
What else might I try?
Using async.each will execute all the tasks in parallel, this might cause some performance issues specially when there are a lot of tasks in the tasks array, as a work around "though it might be slower in execution!" you might want to use async.eachSeries to insure that all your tasks run one by one instead of executing them in parallel.
I am new to Node.js world, kind of stuck in situation.
below code is for reference:
var http = require('http');
var step = require('step');
var request = require('request');
exports.readimage2 = function(req, res){
//res.send(200,'OK');
//var image_url = 'http://www.letsgodigital.org/images/artikelen/39/k20d-image.jpg'; //--- 10mb
//var image_url = 'http://upload.wikimedia.org/wikipedia/commons/2/2d/Snake_River_(5mb).jpg';
//var image_url = 'http://www.sandia.gov/images2005/f4_image1.jpg'; //--- 2mb
var image_url = 'http://www.fas.org/nuke/guide/pakistan/pakistan.gif'; // --- some KB
http.get(image_url,
function(responseData) {
var data = new Buffer(parseInt(responseData.headers['content-length'],10));
var pos = 0;
responseData.on('data', function(chunk) {
chunk.copy(data, pos);
pos += chunk.length;
});
responseData.on('end', function () {
res.send(200, data);
});
});
};
Above code fails working for large files if i use it with step module.
Anyone suggest how to do it properly with step.
Here how i did it using step..... although the request module did same for image buffer download thanks to a post on stackoverflow just need to set encoding to null in request to work for buffer response.
var canvas = new Canvas(3000, 3000),
ctx = canvas.getContext('2d'),
Image = Canvas.Image;
var image_url = "http://www.a2hosting.com/images/uploads/landing_images/node.js-hosting.png";
//var image_url = 'http://upload.wikimedia.org/wikipedia/commons/1/16/AsterNovi-belgii-flower-1mb.jpg';
step(
function() {
request.get({
url: image_url,
encoding: null
}, this);
},
function(err, response, body) {
var img = new Image;
img.src = body;
ctx.drawImage(img, 0, 0, img.width, img.height);
//res.send(200, data);
res.send(200, '<img src="' + canvas.toDataURL() + '" />');
}
);
Below is the code working for simple http module of node.
var http = require('http');
var step = require('step');
var request = require('request');
exports.imagedownload = function(req, res){
step(
function(){
console.log('*********** image download start ***********');
fndownload(this);
},
function(err, result){
if(err) {
}
console.log('*********** image download end ***********');
res.send(200, result);
}
);
};
function fndownload(callback) {
var image_url = 'http://upload.wikimedia.org/wikipedia/commons/2/2d/Snake_River_(5mb).jpg'; // --- some KB
http.get(image_url,
function(responseData) {
var data = new Buffer(parseInt(responseData.headers['content-length'],10));
var pos = 0;
responseData.on('data', function(chunk) {
chunk.copy(data, pos);
pos += chunk.length;
});
responseData.on('end', function () {
//res.send(200, data);
callback(null, data);
});
});
};
I have the code below and am trying to access the all_records array once the _.each function has completed. However as it is asynchronous I was wondering if was possible to force a callback onto the underscores each?
var request = require('request'),
cheerio = require('cheerio'),
_ = require('underscore');
var all_records = [];
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
});
// Need to run this once _.each has completed final iteration.
console.log(all_records);
Here is a simple solution using a simple synchronization method:
var count = 101;//there are 101 numbers between 0 and 100 including 0 and 100
_.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
count--;
if(count===0){//101 iterations done
console.log(all_records);
}
});
});
});
A more elegant solution can be accomplied by using async's .parallel method.
var requests = []; //an array for all the requests we will be making
for(var i=0;i<=100;i++){
requests.push((function(done){ //create all the requests
//here you put the code for a single request.
//After the push to all_records you make a single done() call
//to let async know the function completed
}).bind(null,i));//the bind is that so each function gets its own value of i
}
async.parallel(requests,function(){
console.log(all_records);
});
async.each ended up being the easiest to implement.
async.each([0,100], function(start) {
var base_url = "http://www.example.com/search?limit=100&q=foobar&start=";
var url = base_url + start;
request(url, function(err, res, body) {
var $ = cheerio.load(body),
links = $('#results .row');
$(links).each(function(i, link) {
var $link = $(link);
var record = {
title: $link.children('.title').text().trim()
};
all_records.push(record);
});
});
}, function(err){
console.log(all_records);
});