I am trying to scrape the below mentioned URL and trying to extract the link in the tag,using PhantomJS.
The link sends back javascript code which is executed on the browser to form the HTML inside the Body.
The problem is PhantomJS is not executing the Javascript and I am getting the unfinished HTML after executing my code.
Index.js
var page = require('webpage').create();
console.log('The default user agent is ' + page.settings.userAgent);
page.settings.userAgent = 'SpecialAgent';
page.open('http://videohost.site/play/A11QStEaNdVZfvV/', function(status) {
if (status !== 'success') {
console.log('Unable to access network');
} else {
var html = page.evaluate(function() {
return document.getElementsByTagName('body')[0].innerHTML;
});
console.log(html);
}
phantom.exit();
});
I execute it using phantomjs index.js in the console t check the HTML.
Any help regarding how would the script inside the would be executed would be much appreciated.
Update:
works with NightMareJS as according to Vaviloff's suggestion.
app.get('/scrape', function (req, res) {
//All the web scraping magic will happen here
console.log('Scrape')
url = 'http://videohost.site/play/A11QStEaNdVZfvV/'
nightmare
.goto(url)
.evaluate(function(){
return document.body.innerHTML; //pass all of the html as text
})
.end()
.then(function (document) {
console.log(document)
})
.catch(function (error) {
console.error('Search failed:', error);
})
res.send('Woohoo')
})
app.listen('8081')
console.log('Magic happens on port 8081');
exports = module.exports = app;
Related
I have a react project with this nodeJS server:
(basically catch the request and modify some meta-tags for the blog post data , not something huge):
// here we serve the index.html page with the meta tags
app.get("/blog/post/:id", async (req, res, next) => {
//get post info
const postId = req.params.id;
const { postTitle, postDesc } = await fetchBlogPost(postId);
fs.readFile(indexPath, "utf8", (err, htmlData) => {
if (err) {
console.error("Error during file reading", err);
return res.status(404).end();
}
if (postTitle && postDesc) {
htmlData = htmlData
.replace(
"<title>my title</title>",
`<title>${postTitle}</title>`
)
.replace("__META_OG_TITLE__", postTitle)
.replace("__META_OG_DESCRIPTION__", postDesc)
.replace("__META_DESCRIPTION__", postDesc);
return res.send(htmlData);
}
});
});
However - when the user is reaching /blog/post/ID , the server runs and fetch the post data while using the await method - which cause the user to see a blank white page for 2-3 seconds untill the server returns the html.
Is there anything that i can do about it ? i would like the user maybe to see some loading before, because right now its a blank white page.
In the server script I try to deliver different html files. When app.post('/login'...) comes in, res.sendFile() is working and the html gets rendered. On the second call, whenn app.get('/go') comes in, the file gets served, but not displayed. I cannot explain why the second HTML file is not displayed. What am I doing wrong?
the second request comes from a fetch request in a javascript
socket.on('gameStarted', (data) => {
console.log("Game started");
fetch('/go', {method: 'GET'});
})
served but not displayed
app.post('/login', async (req, res, next) => {
var roomNR = req.body.player.gameCode;
var playerName = req.body.player.nickname;
var codeValid = await checkCode(activeRoomsCollection, gameCodes, roomNR);
var playerExists = await playerCollection.findOne({ playerName: playerName })
if (codeValid) {
if ((playerExists === null) || !playerExists) {
playerCollection.insertOne({ room: roomNR, playerName: playerName, state: false });
console.log(`Added player '${playerName}' with roomnumber '${roomNR}'`);
res.sendFile(path.join(__dirname + '/../../public/lobby.html'), function (err) {
if (err) {
console.log(err);
res.status(err.status).end();
}
else {
console.log('Sent Lobby');
}
});
} else {
// updateDomElement(player, elementId, data)
//res.send('Benutzername existiert bereits');
}
} else {
res.send('Code ungültig');
}
});
app.get('/go', (req, res, next ) => {
res.sendFile(path.join(__dirname + '/../../public/raetsel1.html'), function (err) {
if (err) {
console.log(err);
res.status(err.status).end();
}
else {
console.log('Sent Raetsel1');
}
});
});
fetch() never displays anything on its own. It's a way for your Javsascript to issue http requests to remote servers and those servers then return content back to your Javascript. The result from those http requests ONLY goes to your Javascript. Nothing in the view of the page is affected at all by a fetch() call.
If you want the result of a fetch() call to display something in your page, you would need to write Javascript to do that (to insert content into the current page).
If, instead, you just want the browser to go to a new page, then change from this:
fetch('/go', {method: 'GET'});
to this:
window.location = "/go";
This will cause the browser to go to the URL, retrieve the content and display it. This will shut-down the current page and load and display a new page and the URL in the URL-bar in the browser will show the updated location.
Note that if you have socket.io code in both pages, it will disconnect the current socket.io connection and then run the Javascript in the new page - causing it to create a new socket.io connection (if you have code in the new page to do that) as that is what happens to socket.io connections when you load and display a new web page in the browser.
Im trying to refresh a clients webpage (using a router) and everywhere I look I see something along the lines of using res.redirect(same page link of some sort), however for some reason this isnt working for me, do you know of any alternatives?
my code looks something like this
router.post('/sendSnippet', function (req, res) {
req.on('data', function(data) {
User.findOne({email: req.user.email}).then((userToEdit) =>{
if(userToEdit){
var newSnippet = {
"types":[],
"code": data.toString()
}
userToEdit.snippets.push(newSnippet)
userToEdit.save().then(()=>{
//refresh here
res.redirect('/profile/');
})
}
})
})
});
thanks for any help in advance
Assuming you are trying to force-reload a client's browser tab for your website, I don't think you can do that server-side.
You can use meta http-equiv or the Refresh HTTP header to tell the client's browser to refresh the page after some time or use client javascript to refresh the page:
Router:
router.post("/example", (req, res) => {
res.header("Refresh", "10"); // tells the browser to refresh the page after 10 seconds
res.send("your data");
});
Meta:
<head>
<!-- Refresh after 10 seconds -->
<meta http-equiv="refresh" content="10">
</head>
Javascript + html:
<html>
<body>
<script>
// reloads after 10 seconds
setTimeout(() => {
location.reload();
}, 10000);
// or you could have some kind of API to tell when to refresh the page
function check() {
const x = new XMLHttpRequest();
x.open("GET", "some path");
x.send();
x.onload = function() {
if (x.response === "done") {
location.reload();
} else {
setTimeout(check, 1000);
}
}
}
check();
</script>
</body>
</html>
I have a NodeJS App with Socket Io integration. Now my web page and app both has been implemented but I am facing one issue during execution:
Below is my web page code:
<script>
$(document).ready(function(){
$("#batch")[0].reset();
var socket = io.connect('http://xx.xx.xxx.xx:xxxx',{'forceNew':true });
socket.on('message', function (data) {
var newtext = data;
document.batch.outputtext.value += newtext;
});
socket.on('end', function (data) {
socket.disconnect();
});
});
</script>
And my NodeJS App:
exec_script = function(resp) {
socket.on('connection', function (client) {
console.log('Connection Established');
client.on('disconnect', function () {
console.log('disconnected');
return;
});
var pyshell = new PythonShell('./test.py', options ={ mode: 'text', pythonOptions: ['-u'] });
pyshell.stdout.on('data', function(data) {
client.emit('message', data);
console.log(data);
});
pyshell.end(function(err) {
if (err) throw err;
console.log('End Script');
client.emit('end', 'end');
client.disconnect();
});
});
};
The issue I am facing is that when the Python scripts executes its output is send to browser while my browser status states "Waiting for xx.xx.xxx.xx" and in my FF I see the blue circle circling - that's fine - but even after the Python script has ended and socket disconnected explicitly I still see the browser status as "Waiting for xx.xx.xxx.xx browser title as Connecting with the blue circle rotating?
How can I close and end the connection successfully with the fact that I need the same page in the browser that is I would not navigate the user to some other page?
I tried by using response.end() but the issue I am facing is that if the request data was posted as URL form /today then calling response.end() changes the URL at browser side to http://xx.xx.xxx.xx:xxxx/today leading to a blank / error page which is what I a do not want in my case - the URL should remain as http://xx.xx.xxx.xx:xxxx?
Below is the method I am calling my exec_script method:
router.post('/', function(req, res) {
methods.process(req, res);
});
exports.process = function(req, resp) {
var bname = req.body['date'];
if(typeof req.body['date'] !== "undefined" && req.body['date'] !== null)
{
exec_script(req, resp);
}
};
var content = 'Hi, welcome to my webpage.';
var options = {host: 'www.website.com', path: '/folder/song.mp3', agent:false};
http.get(options, function(r) {
r.on('data', function() {
if (r.statusCode !== '404') {
content += 'Download';
}
});
});
fs.writeFile('index.html', content);
So normally, if I want to write a static html webpage from a node.js script, this works perfectly. For some reason, however, if I try to append to content from within http.get, it doesn't work. The whole point is to check if the file/page exists from an external website, and if it does then to display a link to it. The code that checks for the existing file works just fine, but I can't append anything to an external variable it seems. Any help would be much appreciated.
You need to use end event of http to indicate when you finish receiving data.
As node is asynchronouse the fs.writeFile instruction is run before all your data is received.
Here's how you can do it:
http.get(options, function(r) {
r.on('data', function() {
if (r.statusCode !== '404') {
content += 'Download';
}
});
r.on('end', function() {
fs.writeFile('index.html', content);
});
});