First off, I am a total newbie to both JavaScript and Node.js, so sorry if my question is stupid.
I am trying to scrape text and images from a website and export them to a PDF using request, cheerio, and pdfkit, but I'm having problems.
I am able to scrape the images and save them locally using this:
var $ = cheerio.load(body);
$("#mediatab1 img").each(function(){
    var image = 'http://WWW.WEBSITE.no' + $(this).attr('src');
    images.push(image);
});
for(var i = 0; i < images.length; i++){
    request(images[i]).pipe(fs.createWriteStream('images/' + i + '.jpg'));
}
BUT! Here's the problem:
1. When I try to write the files to the PDF using
doc.image('images/0.jpg');
all I get is
Error: Unknown image format.
    at Function.PDFImage.open (C:\nodejs\node_modules\pdfkit\js\image.js:41:15)
    at PDFDocument.module.exports.image (C:\nodejs\node_modules\pdfkit\js\mixins\images.js:27:26)
    at Request._callback (C:\nodejs\prosjekt.js:29:6)
    at Request.self.callback (C:\nodejs\node_modules\request\request.js:344:22)
    at Request.emit (events.js:98:17)
    at Request.<anonymous> (C:\nodejs\node_modules\request\request.js:1239:14)
    at Request.emit (events.js:117:20)
    at IncomingMessage.<anonymous> (C:\nodejs\node_modules\request\request.js:1187:12)
    at IncomingMessage.emit (events.js:117:20)
    at _stream_readable.js:944:16
0.jpg is 0 bytes, so I suspect there is a timing issue here?
2. I tried to use .pipe instead of saving locally:
request(images[i]).pipe(doc.image(images[0]));
But all I get is:
"Error: ENOENT, no such file or directory 'C:\nodejs\http:\www.WEBSITE.no\Common\Tools\ImageScaler.ashx?id=c7d73548-8198-4bd1-867d-33fc0dfe73d1&h=4
13'
Any idea how to fix this or to solve the problem in any other way?
Here's the whole script:
var request = require('request'),
    cheerio = require('cheerio'),
    PDFDocument = require('pdfkit'),
    doc = new PDFDocument,
    fs = require('fs'),
    prompt = require('prompt');
bilder = [];
prompt.start();
prompt.get(['prosjekturl'], function (err, result) {
    request({url: 'http://www.WEBSITE.no/no/Prosjekter/Prosjekt/?pid=' + result.prosjekturl, encoding: null}, function(err, resp, body){
        if(!err && resp.statusCode == 200){
            // console.log(body);
            var $ = cheerio.load(body);
            $("#mediatab1 img").each(function(){
                var bilde = 'http://www.WEBSITE.no' + $(this).attr('src');
                bilder.push(bilde);
            });
            console.log(bilder);
            for(var i = 0; i < bilder.length; i++){
                request(bilder[i]).pipe(fs.createWriteStream('images/' + i + '.jpg'));
            }
            $("#MiddleRightContainer h1").each(function(){
                var tittel = $(this).text();
                console.log(tittel);
                doc.pipe(fs.createWriteStream('pdf/output.pdf'));
                doc.font('fonts/FONT-Regular.ttf');
                doc.fontSize(32);
                doc.text(tittel);
            });
            $("#MiddleRightContainer .user-content p").each(function(){
                var tekst = $(this).text();
                console.log(tekst);
                doc.pipe(fs.createWriteStream('pdf/output.pdf'));
                doc.fontSize(12);
                doc.text(tekst);
            });
            $("#RightSidebar div.box2").each(function(){
                var fakta = $(this).text();
                console.log(fakta);
            });
        }
        doc.end();
    });
});
From the docs:
"PDFKit supports the JPEG and PNG formats"
In the source you can see it's checking for .jpeg and .png extensions; yours is a .jpg file. I've had trouble with this a few times, and this has fixed my issue.
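If the extension check is indeed the culprit, saving the downloads with a .jpeg extension is a one-line change to the loop from the question (a sketch):
// Sketch: write with a .jpeg extension so an extension-based format check accepts it.
request(bilder[i]).pipe(fs.createWriteStream('images/' + i + '.jpeg'));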
When it comes to file loading, I suspect the Windows paths can be a problem. Try using Node.js's built-in path resolution: https://nodejs.org/api/path.html
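For example (a sketch; the directory layout is assumed from the question):
// Sketch: build an absolute path instead of relying on the current working directory.
var path = require('path');
var imagePath = path.join(__dirname, 'images', '0.jpg');
doc.image(imagePath);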
When it comes to request and loading images it should not be more than:
request({
url: url,
// Prevents Request from converting response to string
encoding: null
}, function (err, response, body) {
doc.image(body)
})
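If you do want to save the images to disk first, the 0-byte file suggests doc.image() ran before the download had finished; waiting for each write stream's 'finish' event avoids that. A minimal sketch (variable names taken from the question's script; note doc.end() would also have to be delayed until every image has been added):
// Sketch: add each image to the PDF only after its download has flushed to disk.
bilder.forEach(function(bilde, i){
    var filename = 'images/' + i + '.jpg';
    var file = fs.createWriteStream(filename);
    request(bilde).pipe(file);
    file.on('finish', function(){
        doc.image(filename);
    });
});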
Hope it helps some.
So I tested my scraping on a static HTML file before adding it to my Node app.
The problem is that it's not returning all the rows.
On the site:
$('#sport tr').length
//Returns 13
In Cheerio:
$('#sport tr').length
//Returns 2
I'm stumped; here is the code I'm using. I've included the URL so you can visit it yourself if you wish.
I suspect it's something to do with var $ = cheerio.load(html); however, I'm not experienced enough with Cheerio to say outright that's the problem.
var express = require('express');
var fs = require('fs');
var request = require('request');
var cheerio = require('cheerio');
var app = express();

app.get('/scrape', function(req, res){
    var url = 'http://www.olbg.com/football.php';
    var json = [];
    request(url, function(error, response, html){
        if(!error){
            var $ = cheerio.load(html);
            console.log($('#sport tr').length);
            var headers = [];
            $('#sport tr th').each(function(i, th) {
                var text = $(th).text();
                if (text.trim() !== "") {
                    headers[i] = text.replace(/[\t\n\r\s]/mgi, '');
                }
            });
            $('#sport tr').each(function(i, tr) {
                // skip if header
                if (!$(tr).is('th')) {
                    var temp = {};
                    temp["Event"] = $(tr).find('td').eq(0).text().trim();
                    temp["TopSelection"] = $(tr).find('td').eq(1).text().trim();
                    temp["BookieOdds"] = $(tr).find('td').eq(2).text().trim();
                    temp["OLBGRating"] = $(tr).find('td').eq(3).find('img').length;
                    if (temp["Event"] !== "" || temp["TopSelection"] !== ""){
                        json.push(temp);
                    }
                }
            });
        }
        // Write the result to disk with the built-in 'fs' library.
        // Parameter 1: 'output.json' - the filename to create
        // Parameter 2: JSON.stringify(json) - the data to write
        // Parameter 3: a callback that reports the status of the write
        fs.writeFile('output.json', JSON.stringify(json), function(err){
            console.log('File successfully written!');
        });
        // Finally, send a message to the browser reminding you that this app does not have a UI.
        res.send(json);
    });
});

app.listen("8081");
console.log("Magic happens on port 8081");
exports = module.exports = app;
The reason that you're not getting the expected result is because the (table) html on that page is mangled. If you look at the second <td> in the second <tr> of the table#sport, you'll see an "extra" </td>. This causes the <td> that the table#sport is inside to close (and an implicit closing of table#sport) on some parsers because that is the closest open <td>. So that is why the parser reports only 2 <tr>s instead of 13. The other <tr>s you're expecting are now outside of table#sport.
Probably your best bet is to pass the HTML through an HTML tidying program/script (e.g. this one with the clean option enabled) before passing it to cheerio. After that, your selector should return the elements you're probably expecting.
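As a rough illustration, the tidying could be done in-process by shelling out to the HTML Tidy command-line tool (a sketch; it assumes the tidy binary is installed and on PATH, and the flags vary by version):
// Sketch: run the fetched HTML through the `tidy` CLI before handing it to cheerio.
var spawn = require('child_process').spawn;
var cheerio = require('cheerio');

function tidyThenLoad(html, done) {
    // '-q' suppresses non-error output; '--clean yes' enables Tidy's clean option.
    var tidy = spawn('tidy', ['-q', '--clean', 'yes', '--show-warnings', 'no']);
    var out = '';
    tidy.stdout.on('data', function(chunk){ out += chunk; });
    tidy.on('close', function(){ done(cheerio.load(out)); });
    tidy.stdin.end(html);
}
After tidying, $('#sport tr').length should report the rows you expect.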
(new information below)
I am trying to set up a lambda function that reacts to uploaded tgz files by uncompressing them and writing the results back to S3. The unzip and untar work fine, but uploading to S3 fails:
/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:350
var buf = self.body.read(self.partSize - self.partBuffer.length) ||
^
TypeError: undefined is not a function
at ManagedUpload.fillStream (/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:350:25)
at Entry.<anonymous> (/Users/russell/lambda/gzip/node_modules/aws-sdk/lib/s3/managed_upload.js:167:28)
at Entry.emit (events.js:104:17)
at Entry._read (/Users/russell/lambda/gzip/node_modules/tar/lib/entry.js:123:12)
at Entry.end (/Users/russell/lambda/gzip/node_modules/tar/lib/entry.js:82:8)
at Parse._process (/Users/russell/lambda/gzip/node_modules/tar/lib/parse.js:107:13)
at BlockStream.<anonymous> (/Users/russell/lambda/gzip/node_modules/tar/lib/parse.js:47:8)
at BlockStream.emit (events.js:107:17)
at BlockStream._emitChunk (/Users/russell/lambda/gzip/node_modules/tar/node_modules/block-stream/block-stream.js:145:10)
at BlockStream.write (/Users/russell/lambda/gzip/node_modules/tar/node_modules/block-stream/block-stream.js:45:10)
This error occurs when I write to S3, but if instead I write the files locally to disk it works, so the pipeline is correct.
Here is code that demonstrates the problem:
var aws = require('aws-sdk');
var s3 = new aws.S3({apiVersion: '2006-03-01'});
var zlib = require('zlib');
var tar = require('tar');
var fstream = require('fstream');

fstream.Reader({'path': 'testdata.tar.gz'})
    .pipe(zlib.Unzip())
    .pipe(tar.Parse())
    .on('entry', function(entry) {
        var filename = entry.path;
        console.log('got ' + entry.type + ' ' + filename);
        if (entry.type == 'File') {
            if (1) { // switch between working and nonworking cases
                s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + filename, Body: entry}, {},
                    function(err, data) {
                        if (err)
                            console.log('ERROR!');
                        else
                            console.log('OK');
                    });
            }
            else {
                entry.pipe(fstream.Writer({ 'path': '/tmp/mytest/' + filename }));
            }
        }
    });
If the code is set to write to S3 it fails with the above error; if it writes the extracted files locally it succeeds. entry is a stream, and according to the docs it should be accepted as the upload Body parameter. I put a print statement in ManagedUpload, where the failure occurs, and confirmed that self.body is a stream:
var stream = require('stream');
console.log('is it a stream? ' + ((self.body instanceof stream) ? 'yes' : 'no'));
console.log('self.body.read is ' + self.body.read);
returns
$ got File gunzip.js
is it a stream? yes
self.body.read is undefined
I'm pretty new to AWS and Node.js, so there could be a basic problem with this, but I've spent a day and haven't found it. I did the upload call with unzip instead of gzip and it worked (using lambda functions to unzip archives in S3 is really slow). Can anyone point me at something I am doing wrong in this code?
Thanks
I think I understand this a little better. I broke the pipeline up into pieces and looked at each one. The problem is that tar.Parse uses fstream and not stream. If I look at the return of the .pipe(tar.Parse()) statement it is a stream, but it is not a stream.Readable or a stream.Writable. fstream does not define a read() method (its reader is based on Stream, it is not a stream.Readable), so tar.Parse, which is based on Stream, does not have one either.
So a refinement of the question is, is this a bug in fstream, or is fstream not intended to be a stream? I think it is a bug - from the README:
"Like FS streams, but with stat on them, and supporting directories and
symbolic links, as well as normal files. Also, you can use this to set
the stats on a file, even if you don't change its contents, or to create
a symlink, etc."
In my case running the stream through stream.PassThrough helped.
var PassThrough = require('stream').PassThrough;
var stream = getStreamSomeHow();
var passthrough = new PassThrough();

stream.pipe(passthrough);
s3.upload({..., Body: passthrough});
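Applied to the entry stream from the question, that might look like this (a sketch; bucket and key come from the question's code):
// Sketch: pipe each tar entry through a PassThrough so s3.upload sees a
// standard readable stream with a read() method.
var PassThrough = require('stream').PassThrough;

// inside the .on('entry', ...) handler:
var passthrough = new PassThrough();
entry.pipe(passthrough);
s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + entry.path, Body: passthrough},
    function(err, data) {
        console.log(err ? 'ERROR!' : 'OK');
    });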
Your Body value is a stream object, in which case you will need to use .toString():
var aws = require('aws-sdk');
var s3 = new aws.S3({apiVersion: '2006-03-01'});
var zlib = require('zlib');
var tar = require('tar');
var fstream = require('fstream');

fstream.Reader({'path': 'testdata.tar.gz'})
    .pipe(zlib.Unzip())
    .pipe(tar.Parse())
    .on('entry', function(entry) {
        var filename = entry.path;
        console.log('got ' + entry.type + ' ' + filename);
        if (entry.type == 'File') {
            if (1) { // switch between working and nonworking cases
                s3.upload({Bucket: 'my_bucket', Key: 'gunzip-test/' + filename, Body: entry.toString()}, {},
                    function(err, data) {
                        if (err)
                            console.log('ERROR!');
                        else
                            console.log('OK');
                    });
            }
            else {
                entry.pipe(fstream.Writer({ 'path': '/tmp/mytest/' + filename }));
            }
        }
    });
I made the following to play around a bit with Node.js.
The files in the zipfiles folder get zipped as expected, and everything seems to work.
But I get an error on the command line, and I don't know where it comes from or how to solve it.
events.js:72
throw er; // Unhandled 'error' event
^
Error: write after end
at writeAfterEnd (_stream_writable.js:130:12)
at Gzip.Writable.write (_stream_writable.js:178:5)
at write (_stream_readable.js:583:24)
at flow (_stream_readable.js:592:7)
at ReadStream.pipeOnReadable (_stream_readable.js:624:5)
at ReadStream.EventEmitter.emit (events.js:92:17)
at emitReadable_ (_stream_readable.js:408:10)
at emitReadable (_stream_readable.js:404:5)
at readableAddChunk (_stream_readable.js:165:9)
at ReadStream.Readable.push (_stream_readable.js:127:10)
Here's my script:
var zlib = require('zlib');
var gzip = zlib.createGzip();
var fs = require('fs');

var zip = {
    zipAll: function(dir){
        // files to zip
        fs.readdir(dir, function(err, data){
            if(err) throw(err);
            var arrayValue = data.toString().split(',');
            // files with .gz at the end need to be excluded
            for(var i = 0; i < arrayValue.length; i += 1){
                console.log("Zipping following files: " + arrayValue[i]);
                var input = fs.createReadStream('zipfiles/' + arrayValue[i]);
                var output = fs.createWriteStream('zipfiles/input' + [i] + '.txt' + '.gz');
                input.pipe(gzip).pipe(output);
            }
        });
    }
};

zip.zipAll('zipfiles');
Thanks
The Gzip object is a bit wonky to reuse for multiple files (as far as I know this is undocumented). The easiest way to fix your problem is simply to use a separate gzip object per file, something like:
for(var i = 0; i < arrayValue.length; i += 1){
    console.log("Zipping following files: " + arrayValue[i]);
    var input = fs.createReadStream('zipfiles/' + arrayValue[i]);
    var output = fs.createWriteStream('zipfiles/input' + [i] + '.txt' + '.gz');
    input.pipe(zlib.createGzip()).pipe(output);
}
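As an aside, the exclusion of already-gzipped files noted in the question's comment could be handled with a filter before the loop (a sketch; data is the array from the fs.readdir callback):
// Sketch: skip files that already end in .gz before compressing.
var path = require('path');
var toZip = data.filter(function(name){
    return path.extname(name) !== '.gz';
});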
I'm using Node v0.10.11 on Ubuntu 12.04. I can't figure out what I'm missing to make streams of URLs work with the request module.
This program is trying to go to a mailing list site, find the download links for each month, then download the pages for each month.
Mikael's readme says "The first argument can be either an url or an options object. The only required option is URI, all others are optional.
uri || url - fully qualified uri or a parsed url object from url.parse()"
If I call url.parse('www.targeturl.com'), I get
Error: options.uri is a required argument
If I don't use url.parse, I get
Error: Invalid URI "www.freelists.org/archive/si-list/06-2013"
(this link works perfectly fine in my browsers)
I've cut the code down to 42 lines. Any advice is welcome.
var request = require('request'),
    url = require('url'),
    stream = require('stream'),
    cheerio = require('cheerio'), // a reduced jQuery-style DOM library
    Transform = require('stream').Transform;

var DomStripStream = function(target) {
    this.target = target;
    stream.Transform.call(this, {objectMode: true});
};

DomStripStream.prototype = Object.create(
    Transform.prototype, {constructor: {value: DomStripStream}}
);

DomStripStream.prototype.write = function () {
    this._transform.apply(this, arguments);
};

DomStripStream.prototype.end = function () {
    this._transform.apply(this, arguments);
    this.emit("end");
};

DomStripStream.prototype._transform = function(chunk, encoding, callback) {
    chunk = chunk ? chunk.toString() : "";
    $ = cheerio.load(chunk);
    domLinks = $(this.target);
    $(domLinks).each(function (i, link) {
        currLink = 'www.freelists.org' + $(link).attr('href');
        // currLink = url.parse(currLink)
        request(currLink, function (error, response, body) {
            console.log(error);
        });
    });
};

var fs = require("fs"),
    output = fs.createWriteStream("out.txt"),
    mainPage = new DomStripStream('td a');

request('http://www.freelists.org/archive/si-list').
    pipe(mainPage).
    pipe(output);
Add http:// or https:// to the URL.
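In this case that means prefixing the scraped hrefs with a scheme before handing them to request (a sketch; the helper name is made up):
// Sketch: ensure a URL carries a scheme before passing it to request.
function ensureScheme(u) {
    return /^https?:\/\//.test(u) ? u : 'http://' + u;
}

// e.g. inside _transform:
currLink = ensureScheme('www.freelists.org' + $(link).attr('href'));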
I'm trying to make a simple HTTP GET request in Node.js, but I'm running into trouble with v0.3.4-pre (i.e. compiled from HEAD as of this morning). Here's my code:
var cli = require('cli');
var http = require('http');
var url = require('url');

cli.parse();
cli.main(function(args, opts) {
    this.debug(args[0]);
    var siteUrl = url.parse(args[0]);
    var site = http.createClient(siteUrl.port, siteUrl.host);
    console.log(siteUrl);
    var request = site.request("GET", siteUrl.pathname, {'host': siteUrl.host});
    request.end();
    request.on('response', function(response) {
        response.setEncoding('utf8');
        console.log('STATUS: ' + response.statusCode);
        response.on('data', function(chunk) {
            console.log("DATA: " + chunk);
        });
    });
});
Here's the error that I get:
node.js:68
throw e; // process.nextTick error, or 'error' event on first tick
^
Error: EAFNOSUPPORT, Address family not supported by protocol family
at doConnect (net.js:499:19)
at Client.connect (net.js:652:30)
at Client._ensureConnection (http.js:1033:10)
at Client.request (http.js:1048:8)
at Object.<anonymous> (/Users/paul/Desktop/readify.js:16:21)
at /usr/local/lib/node/.npm/cli/0.2.3-2/package/cli.js:995:18
at Object.main (/usr/local/lib/node/.npm/cli/0.2.3-2/package/cli.js:1000:9)
at Object.<anonymous> (/Users/paul/Desktop/readify.js:10:5)
at Module._compile (node.js:359:32)
at Object..js (node.js:367:14)
Found the bug: siteUrl.port will be undefined unless the URL explicitly names a port. So the solution is:
var site = http.createClient(siteUrl.port || 80, siteUrl.host);
var site = http.createClient(siteUrl.port, siteUrl.host);
should rather be
var site = http.createClient(siteUrl.port || 80, siteUrl.hostname);
Note hostname rather than host: url.parse()'s host property includes the port (e.g. 'example.com:8080'), while hostname is just the host name.
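As an aside, http.createClient was deprecated in later Node releases; a rough equivalent of the same request using http.get might look like this (a sketch, defaulting to port 80):
// Sketch: the same GET using http.get instead of the deprecated http.createClient.
var http = require('http');
var url = require('url');

var siteUrl = url.parse(process.argv[2]);
http.get({
    host: siteUrl.hostname,
    port: siteUrl.port || 80,
    path: siteUrl.pathname
}, function(response) {
    response.setEncoding('utf8');
    console.log('STATUS: ' + response.statusCode);
    response.on('data', function(chunk) {
        console.log('DATA: ' + chunk);
    });
});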
The same error message appeared on my very old XP SP2 box for ANY connect() attempt. E.g. npm wasn't able to do anything, and simple HTTP requests failed.
While trying to find a solution, this post appeared all over the place, but it's not the same issue.
In my case it had to do with WSAIoctl(...) always returning WSAEOPNOTSUPP when querying for WSAID_CONNECTEX, which seemed strange. This led me to a post recommending a "netsh winsock reset" from the cmd, which fixed the problem!