I'm experimenting with Node.js and web scraping. In this case, I'm trying to scrape the most recent songs from a local radio station for display. With this particular website, body returns nothing. When I try Google or any other website, body has a value.
Is this a feature of the website I'm trying to scrape?
Here's my code:
var request = require('request');
var url = "http://www.radiomilwaukee.org";

request(url, function (err, resp, body) {
  if (!err && resp.statusCode == 200) {
    console.log(body);
  } else {
    console.log(err);
  }
});
That's weird: the website you're requesting doesn't seem to return anything unless the accept-encoding header is set to gzip. With that in mind, using this gist will work: https://gist.github.com/nickfishman/5515364
I ran the code within that gist, replacing the URL with "http://www.radiomilwaukee.org", and saw the content within the sample.html file once the code completed.
If you'd rather have access to the web page's content within the code, you could do something like this:
var zlib = require('zlib');
// ...
req.on('response', function (res) {
  var body = '', encoding, unzipped;
  if (res.statusCode !== 200) throw new Error('Status not 200');
  encoding = res.headers['content-encoding'];
  if (encoding == 'gzip') {
    unzipped = res.pipe(zlib.createGunzip());
    unzipped.on('readable', function () {
      // collect the decompressed content in the body variable;
      // read() returns null when no data is available, so guard against that
      var chunk = unzipped.read();
      if (chunk !== null) body += chunk.toString();
    });
  }
  // ...
});
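As an aside, newer versions of the request module can handle this for you: passing gzip: true adds the Accept-Encoding header and transparently decompresses the response. A minimal sketch, assuming request 2.x or later:

var request = require('request');

// gzip: true asks for gzip and inflates the response automatically,
// so body arrives as plain text
request({ url: 'http://www.radiomilwaukee.org', gzip: true }, function (err, resp, body) {
  if (!err && resp.statusCode == 200) {
    console.log(body);
  }
});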
I (maybe falsely) assumed Lambda@Edge could modify origin response content, so I wrote a Lambda function like this:
/* this does not work: response.Body is not defined */
'use strict';

exports.handler = (event, context, callback) => {
  var response = event.Records[0].cf.response;
  var data = response.Body.replace(/OLDTEXT/g, 'NEWTEXT');
  response.Body = data;
  callback(null, response);
};
This fails because you cannot reference the origin response body with this syntax.
Can I modify this script to make it work as intended, or should I consider using another AWS service?
My background: We are trying to set up an AWS CloudFront distribution that consolidates access to several websites, like this:
ttp://foo.com/ -> https:/newsite.com/foo/
ttp://bar.com/ -> https:/newsite.com/bar/
ttp://boo.com/ -> https:/newsite.com/boo/
The sites are currently managed by external parties. We want to disable direct public access to foo/bar/boo and have newsite.com be the only site visible on the internet.
Mapping the origins into a single CloudFront distribution is relatively simple. However, doing so will break HTML content that references files by absolute URL once the current domain names are removed from the web:
ttp://foo.com/images/1.jpg
-> (disable foo.com dns)
-> image not found
To benefit from CloudFront caching and its other merits, I want to modify/rewrite all absolute file references in the HTML files to relative URLs, so
<img src="ttp://foo.com/images/1.jpg">
becomes
<img src="/foo/images/1.jpg">
//(accessed as https:/newsite.com/foo/images/1.jpg by a user)
//(maybe I should make it an absolute URL for SEO purposes)
(http is written as ttp above due to restrictions on posting the banned domain name foo.com)
(edit)
I found this AWS blog post, which may be a great hint but feels more convoluted than I expected (set up a Linux container so I can just use sed to process the HTML files, maybe using S3 as temporary storage). I hope I can find a simpler way:
https://aws.amazon.com/blogs/networking-and-content-delivery/resizing-images-with-amazon-cloudfront-lambdaedge-aws-cdn-blog/
From what I have just learnt myself, you unfortunately cannot modify the response body within Lambda@Edge; you can only wipe it out or totally replace it. I was hoping to be able to clean all responses from a legacy site, but a CloudFront Lambda@Edge function will not allow this to be done.
As the AWS documentation states here:
When you’re working with the HTTP response, Lambda#Edge does not expose the body that is returned by the origin server to the origin-response trigger. You can generate a static content body by setting it to the desired value, or remove the body inside the function by setting the value to be empty. If you don’t update the body field in your function, the original body returned by the origin server is returned back to viewer.
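So without fetching anything yourself, the most an origin-response trigger can do is replace the body wholesale. A minimal sketch of that, assuming the standard CloudFront origin-response event shape:

'use strict';

// Replaces whatever the origin returned with static content.
// Note the lowercase "body" field; the original body is never readable here.
exports.handler = (event, context, callback) => {
  const response = event.Records[0].cf.response;
  response.body = 'NEWTEXT';
  response.bodyEncoding = 'text'; // 'text' or 'base64'
  callback(null, response);
};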
I ran into the same issue, and have been able to pull some info out of the request headers to piece together a URL from which I can fetch the original body.
Beware: I haven't yet been able to confirm that this is a "safe" method; it may be relying on undocumented behaviour, etc. But for now it DOES fetch the original body properly for me. Of course it also takes another request/round trip, possibly incurring extra transfer costs, execution time, etc.
const https = require('https'); // required by the httpsRequest helper below

const fetchOriginalBody = (request) => {
  const host = request['headers']['host'][0]['value']; // xxxx.yyy.com
  const uri = request['uri'];
  const fetchOriginalBodyUrl = 'https://' + host + uri;
  return httpsRequest(fetchOriginalBodyUrl);
};
// Helper that turns https.request into a promise
function httpsRequest(options) {
  return new Promise((resolve, reject) => {
    const req = https.request(options, (res) => {
      if (res.statusCode < 200 || res.statusCode >= 300) {
        return reject(new Error('statusCode=' + res.statusCode));
      }
      var body = [];
      res.on('data', function (chunk) {
        body.push(chunk); // accumulate raw Buffer chunks
      });
      res.on('end', function () {
        try {
          body = Buffer.concat(body).toString();
          // body = JSON.parse(Buffer.concat(body).toString());
        } catch (e) {
          return reject(e); // return so we don't also call resolve below
        }
        resolve(body);
      });
    });
    req.on('error', (e) => {
      reject(e.message);
    });
    req.end();
  });
}
exports.handler = async (event, context, callback) => {
  const records = event.Records;
  if (records && records.length > 0) {
    const request = records[0].cf.request;
    const body = await fetchOriginalBody(request);
  }
  // ...
};
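Tying it together for the rewrite use case: based on the documentation quoted above, you can set a new body even though you cannot read the original one, so you can fetch the body yourself and return a rewritten copy. A sketch under those assumptions (the regex is a hypothetical example for the foo.com case):

exports.handler = async (event) => {
  const request = event.Records[0].cf.request;
  const response = event.Records[0].cf.response;

  // Fetch the body Lambda@Edge will not expose, then rewrite absolute
  // URLs to relative ones before returning the response.
  const body = await fetchOriginalBody(request);
  response.body = body.replace(/https?:\/\/foo\.com\//g, '/foo/');
  response.bodyEncoding = 'text';
  return response;
};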
I am consuming this ESPN API:
http://api.espn.com/v1/now?apikey=d4skkma8kt2ac8tqbusz38w6
In Node.js, using the node-curl library, my snippet looks like this:
var Curl = require('node-curl/lib/Curl');

var url = 'http://api.espn.com/v1/now?apikey=d4skkma8kt2ac8tqbusz38w6';
var curl = new Curl(); // construct the Curl instance (missing in the original snippet)

curl.setopt('URL', url);
curl.setopt('CONNECTTIMEOUT', 2);
curl.on('data', function (chunk) {
  console.log(chunk);
});
curl.perform(); // the request is not sent until perform() is called
But every time I run this code, I keep getting this response:
<h1>596 Service Not Found</h1>
Strangely, the same URL returns the correct response if I hit it from the browser, so the URL is not invalid; this happens only when I call it from Node.js. Can anyone guide me on how to resolve this error? I tried encoding/decoding the URL and still get the same response.
Also, I am avoiding vendor-specific libraries as much as possible, since we want a generic API-calling framework.
You could use request, which is a very popular module. Here's an example of how you could do it:
var request = require('request');
var url = 'http://api.espn.com/v1/now?apikey=d4skkma8kt2ac8tqbusz38w6';

request(url, function (error, response, body) {
  if (!error && response.statusCode == 200) {
    console.log(JSON.parse(body));
  }
});
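Since you mentioned wanting to avoid vendor-specific libraries, here's a sketch of the same call using only Node's built-in http module (no third-party dependencies):

var http = require('http');
var url = 'http://api.espn.com/v1/now?apikey=d4skkma8kt2ac8tqbusz38w6';

http.get(url, function (res) {
  var body = '';
  res.on('data', function (chunk) { body += chunk; }); // accumulate the response
  res.on('end', function () {
    if (res.statusCode == 200) {
      console.log(JSON.parse(body));
    }
  });
}).on('error', function (e) {
  console.log(e.message);
});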
How can I check whether a YouTube video exists, server side in a Node.js app? Something like:
var youtubeId = "adase268_";

// pseudo code
var youtubeVideoExist = function (youtubeId) {
  return true; // if youtube video exists
};
You don't need to use the YouTube API per se; you can look for the thumbnail image instead.
A valid video returns 200 OK:
http://img.youtube.com/vi/gC4j-V585Ug/0.jpg
An invalid video returns 404 Not Found:
http://img.youtube.com/vi/gC4j-V58xxx/0.jpg
I thought I could make this work from the browser, since you can load images from a third-party site without security problems. But testing it, it fails to report the 404 as an error, probably because the content body is still a valid image. Since you're using Node, you can look at the HTTP response code directly.
I can't think of an approach that doesn't involve making a separate HTTP request to the video link to see whether it exists, unless you know beforehand a set of video IDs that are inactive, dead, or wrong.
Here's an example of something that might work for you. I can't readily tell whether you're using this as a standalone script or as part of a web server. The example below assumes the latter: a web server called on /video?id=123videoId responds depending on whether the video with that ID exists. It uses Node's request library, which you can install with npm install request:
var request = require('request');

// Your route here. Example of what a route may look like when called on /video?id=123videoId
app.get('/video', function (req, res) {
  var videoId = 'adase268_'; // could instead come from the query string, e.g. req.query.id
  request.get('https://www.youtube.com/watch?v=' + videoId, function (error, response, body) {
    if (response.statusCode === 404) {
      // Video doesn't exist. Do what you need to do here.
    } else {
      // Video exists.
      // Can handle other HTTP response codes here if you like.
    }
  });
});
// You could refactor the above to take out the 'request.get()', wrap it in a
// function that takes a callback, and re-use it in multiple routes, as sketched below.
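For example (checkVideoExists is a hypothetical helper name; the route assumes the same Express app as above):

var request = require('request');

// Reusable helper: wraps request.get() and reports existence via callback
function checkVideoExists(videoId, callback) {
  request.get('https://www.youtube.com/watch?v=' + videoId, function (error, response, body) {
    // treat a 404 as "does not exist" and anything else as "exists"
    callback(error, response && response.statusCode !== 404);
  });
}

app.get('/video', function (req, res) {
  checkVideoExists(req.query.id, function (err, exists) {
    res.send(exists ? 'Video exists' : 'Video not found');
  });
});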
@rodrigomartell is on the right track, in that your check function will need to make an HTTP call; however, just checking the youtube.com URL won't work in most cases. You'll get back a 404 if the videoID is malformed (i.e. fewer than 11 characters, or using characters not valid in their scheme), but if it's a properly formed videoID that just happens not to correspond to a video, you'll still get back a 200. It would be better to use an API request, like this (note that it might be easier to use the request-json library instead of just the request library):
var request = require('request-json');
var client = request.newClient('https://www.googleapis.com/youtube/v3/');

// The API call is asynchronous, so the result must be delivered through a
// callback; a plain return from inside the handler would be lost.
var youtubeVideoExist = function (youtubeId, callback) {
  var apikey = 'YOUR_API_KEY'; // register for an API key at the Google Developers Console: https://console.developers.google.com/
  client.get('videos/?part=id&id=' + youtubeId + '&key=' + apikey, function (err, res, body) {
    callback(body.items.length > 0); // true if the youtube video exists
  });
};
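Because the result arrives asynchronously, you consume it with a callback, e.g.:

youtubeVideoExist('y0srjcKC4eY', function (exists) {
  console.log(exists); // true if the video exists
});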
Using the youtube-feeds module. It works fast (~200 ms), and no API_KEY is needed.
var youtube = require("youtube-feeds");

var existsFunc = function (youtubeId, callback) {
  youtube.video(youtubeId, function (err, result) {
    var exists = result.id === youtubeId;
    console.log("youtubeId");
    console.log(youtubeId);
    console.log("exists");
    console.log(exists);
    callback(exists);
  });
};
var notExistentYoutubeId = "y0srjasdkfjcKC4eY";
existsFunc(notExistentYoutubeId, console.log);

var existentYoutubeId = "y0srjcKC4eY";
existsFunc(existentYoutubeId, console.log);
output:
❯ node /pathToFileWithCodeAbove/FileWithCodeAbove.js
youtubeId
y0srjcKC4eY
exists
true
true
youtubeId
y0srjasdkfjcKC4eY
exists
false
false
All you need is to look for the thumbnail image. In Node.js it would be something like:
var http = require('http');

function isValidYoutubeID(youtubeID) {
  var options = {
    method: 'HEAD',
    host: 'img.youtube.com',
    path: '/vi/' + youtubeID + '/0.jpg'
  };
  var req = http.request(options, function (res) {
    if (res.statusCode == 200) {
      console.log("Valid Youtube ID");
    } else {
      console.log("Invalid Youtube ID");
    }
  });
  req.end();
}
An API_KEY is not needed. It is quite fast because only the headers are checked for a 200/404 status code; the image itself is never downloaded.
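Usage is a single call, e.g.:

isValidYoutubeID('gC4j-V585Ug'); // logs "Valid Youtube ID"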
I am trying to get all of the links in a subreddit using the API, but it is only returning one url. Here is the code I have:
var request = require('request');
var webpage = 'http://www.reddit.com/r/AmazonUnder5/top.json?limit=100';

// login
request.post('http://www.reddit.com/api/login', {
  form: { api_type: 'json', passwd: 'password', rem: true, user: 'username' }
});

// get urls (note: the header key must be 'User-Agent', not 'useragent')
request({ uri: webpage, json: true, headers: { 'User-Agent': 'mybot v. 0.0.1' } }, function (error, response, body) {
  if (!error && response.statusCode == 200) {
    for (var key in body.data.children) {
      var url = body.data.children[key].data.url;
      console.log(url);
    }
  }
});
When I visit the json link in my browser, it returns all 100 posts.
That's because only one post currently exists in the top listing:
http://www.reddit.com/r/AmazonUnder5/top
You could use hot instead:
http://www.reddit.com/r/AmazonUnder5/hot.json
Also, you don't need to log in to make public GET requests.
Edit: you are getting so few results because you are not logged in properly.
When logging in, use the
"op" => "login"
parameter and check what cookies and data are returned.
I also recommend using the SSL login URL, since that is what works for me:
https://ssl.reddit.com/api/login/
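Putting those suggestions together with the form fields from the question, a sketch of the login request might look like this (reddit's legacy login API; untested, so treat it as a starting point):

var request = require('request');

request.post('https://ssl.reddit.com/api/login/', {
  form: { op: 'login', api_type: 'json', user: 'username', passwd: 'password', rem: true }
}, function (error, response, body) {
  console.log(response.headers['set-cookie']); // session cookie, if login succeeded
  console.log(body); // JSON describing success or errors
});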
From Node, I'd like to get all image URLs (the src attribute from img tags) from an external web page.
I started by considering PhantomJS, but didn't like that it's not really integrated into Node (i.e. it runs in an external process).
Next, I tried the request module and cheerio. This works great, except I have to deal with relative image URLs, e.g.:
<img src='http://example.com/i.jpg'>
<img src='/i.jpg'>
<img src='i.jpg'>
<img src='../images/i.jpg'>
I can deal with that, but I'm wondering if there's an easier way?
I ended up using the request Node module along with cheerio and url. Here's what I ended up doing (please note: this is MVP code, not production quality):
var request = require('request');
var cheerio = require('cheerio');
var url = require('url');

app.get('/scrape-images', function (req, res) {
  request(req.query.url, function (error, response, body) {
    if (!error && response.statusCode == 200) {
      var $ = cheerio.load(body);
      var reqUrl = url.parse(req.query.url);
      res.send($('img').map(function (i, e) {
        var srcUrl = url.parse($(e).attr('src'));
        if (!srcUrl.host) {
          // relative src: resolve it against the page's own URL
          return url.resolve(reqUrl, srcUrl);
        } else {
          return url.format(srcUrl);
        }
      }).get()); // .get() converts cheerio's wrapper to a plain array for res.send
    }
  });
});