PDF.js - split pdf into pages and re-build multiple files - node.js

I am currently working on a Node.js project. One of the actions required is to read the text of a pdf document and then split the document into separate files.
As I have been using pdf.js for all other pdf parsing in this project, I was hoping to complete the above requirement using it as well.
Reading the PDF and its text content is relatively straightforward.
For example -
function GetWords(pdfUrl) {
    var pdf = PDFJS.getDocument(pdfUrl);
    return pdf.then(function (pdf) {
        // calculate total word count for the document
        var maxPages = pdf.pdfInfo.numPages;
        var countPromises = []; // collecting all page promises
        for (var j = 1; j <= maxPages; j++) {
            var page = pdf.getPage(j);
            countPromises.push(page.then(function (page) { // add page promise
                return page.getTextContent().then(function (content) { // return content promise
                    var txt = ""; // declared here so each page gets its own accumulator
                    for (var i = 0; i < content.items.length; i++) {
                        var txtadd = content.items[i].str;
                        txt += txtadd.replace(/[^a-zA-Z0-9:;,.?!-() ]/g, '');
                    }
                    return txt.split(" ").length; // word count for this page
                });
            }));
        }
        // Wait for all pages and sum the counts
        return Promise.all(countPromises).then(function (counts) {
            var count = 0;
            counts.forEach(function (c) { count += c; });
            return count;
        });
    });
}
However, I can't seem to find any examples of building a PDF from one or more of its pages. Ideally, I would want to use pdf.getPage(j) to get an array of the required pages, then push these into a new document and save that document to disk.
Any help would be appreciated.

I ended up using a separate library to perform the splitting: http://pdfhummus.com/. So in combination with PDF.js I was able to get the desired result.
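For anyone after a concrete starting point, extracting pages with HummusJS looks roughly like this (a sketch based on its createWriter/appendPDFPagesFromPDF API; the file names and the zero-based page range are placeholders):

var hummus = require('hummus');

// copy page 1 (zero-based range [0, 0]) of source.pdf into a new file
var pdfWriter = hummus.createWriter('page1.pdf');
pdfWriter.appendPDFPagesFromPDF('source.pdf', {
    type: hummus.eRangeTypeSpecific,
    specificRanges: [[0, 0]] // [firstPage, lastPage], zero-based
});
pdfWriter.end();

Repeating this per page (or per page range) gives you one output file per split.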

Related

Wait for a function to create modified Array

I'm writing a React app. After clicking one button, I want a file to be downloaded. Before that, the array that I have has to be modified so the downloaded report is in the proper format.
The problem I have is that I don't know how to force getReports() to wait for setInOrder() to process the data; as a result, the code never enters the loop.
export const setInOrder = async (objects) => {
    var sortedObjectsAll = new Object();
    for (let i = 0; i < objects.length; ++i) {
        if (sortedObjectsAll.hasOwnProperty(objects[i].addedBy)) {
            sortedObjectsAll[objects[i].addedBy].push(objects[i]);
        } else {
            sortedObjectsAll[objects[i].addedBy] = new Array();
            sortedObjectsAll[objects[i].addedBy].push(objects[i]); // don't drop the first item for each key
        }
    }
    return sortedObjectsAll
}
export const getReports = async (objects) => {
    const sortedObjectsAll = await setInOrder(objects) // This is correct but not available instantly
    console.log(sortedObjectsAll) // this is correctly printed
    const reports = new Array();
    for (let j = 0; j < sortedObjectsAll.length; ++j) {
        console.log("Never enters here")
        reports.push(createReport(sortedObjectsAll[j]))
    }
    return reports
}
I'm trying to use await or async somehow, but can't solve it. I see some Promises advised but I don't know how to really return the resulting variable to the code that actually downloads the report.
First, you do not need async/await here at all: setInOrder does nothing asynchronous (and an async function with no await in it waits for nothing).
Second, you are iterating over an object as if it were an array, and that is the problem: an object has no length property, so the loop condition is never true. Replace the loop with the following (there are other solutions as well):
for (const key in sortedObjectsAll) {
    ...
}
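Putting both points together, a corrected version might look like this (a sketch; createReport is the helper from the question, and the unneeded async is dropped from both functions):

// setInOrder without async: it does purely synchronous work
export const setInOrder = (objects) => {
    const sorted = {};
    for (const obj of objects) {
        if (!sorted[obj.addedBy]) sorted[obj.addedBy] = [];
        sorted[obj.addedBy].push(obj);
    }
    return sorted;
};

export const getReports = (objects) => {
    const sortedObjectsAll = setInOrder(objects);
    const reports = [];
    // iterate over the object's keys instead of a numeric index
    for (const key in sortedObjectsAll) {
        reports.push(createReport(sortedObjectsAll[key]));
    }
    return reports;
};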

Test random URLs from spreadsheet using alasql

I have a large number of URLs within an xlsx file. What I'd like to do is randomly select some of these URLs, load them, then check that they return a status code of 200.
So I'm using the npm alasql package to do this.
At the moment, the following code successfully loads the first 5 URLs in the spreadsheet, checks that they return a 200, then finishes the test.
var alasql = require('alasql');
var axios = require('axios');

module.exports = {
    '#tags': ['smokeTest'],
    'Site map XML pages load': async (browser) => {
        const result = await alasql.promise('select URL from xlsx("./testUrls.xlsx",{sheetid:"RandomUrls"})');
        var xcelData = result.map(item => {
            return item.URL;
        });

        async function siteMapUrlsTestArr(item) {
            var response = await axios.get(browser.launch_url + item);
            browser.verify.equal(response.status, 200);
            console.log('Sitemap test URL =', (browser.launch_url + item));
        }

        for (let index = 0; index < xcelData.length; index++) {
            if (index < 5) {
                const xmlTestUrl = xcelData[index];
                await siteMapUrlsTestArr(xmlTestUrl);
            }
        }
    },

    'Closing the browser': function (browser) {
        browser.browserEnd();
    },
};
However, what I'd like to do is randomly select 5 URLs from the (large) list of URLs in the spreadsheet, rather than the first 5 URLs.
I appreciate that this will (probably) involve Math.floor(Math.random() * ...), but I can't seem to get it to work no matter where I place it.
Any help would be greatly appreciated. Thanks.
Your logic is flawed. Here's how.
You want to select 5 random URLs from the list and run the check on those, but what you are actually doing is taking all the items and running the check on the first five in order.
To correct it:
// Fixed to five as you only want to test 5 URLs.
for (let index = 0; index < 5; index++) {
    // Selecting a random item from the list using Math.random()
    const xmlTestUrl = xcelData[Math.floor(Math.random() * xcelData.length)];
    // Performing the HTTP response operation on it.
    await siteMapUrlsTestArr(xmlTestUrl);
}
The aforementioned solution will select a random item in each loop and perform the HTTP response operation on it. The items will be randomly selected using Math.random().
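One caveat: because each iteration draws independently, the same URL can be picked more than once. If you need five distinct URLs, a small sampling helper avoids repeats (a sketch, reusing xcelData and siteMapUrlsTestArr from the question, inside the same async test function):

// pick n distinct random items by removing each pick from a copy of the array
function sampleDistinct(arr, n) {
    const copy = arr.slice();
    const picked = [];
    while (picked.length < n && copy.length > 0) {
        const i = Math.floor(Math.random() * copy.length);
        picked.push(copy.splice(i, 1)[0]);
    }
    return picked;
}

for (const url of sampleDistinct(xcelData, 5)) {
    await siteMapUrlsTestArr(url);
}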

Using socketio-file-upload to upload multiple files

I'm using NodeJS with socket.io and socketio-file-upload to upload multiple files, and it works great! However, I'm having an issue: I'm trying to read the name attribute of the input each file came from, so I can save it into my DB.
When I upload one or more files, I can't seem to access the input field's name or anything else that tells me which file came from which input field.
Here is my front:
var uploader = new SocketIOFileUpload(socket);
var array_files_lvl_3 = [
    document.getElementById("l3_id_front"),
    document.getElementById("l3_id_back"),
    document.getElementById("l3_address_proof_1"),
    document.getElementById("l3_address_proof_2"),
    document.getElementById("l3_passport")
];
uploader.listenOnArraySubmit(document.getElementById("save_level_3"), array_files_lvl_3);
And here is my back:
var uploader = new siofu();
uploader.dir = "uploads/userL3";
uploader.listen(socket);
uploader.on('saved', function (evnt) {
    console.log(evnt);
    // this "evnt" variable has a lot of information,
    // but none of it tells me the input name where it came from.
});
This is what the evnt variable holds: (screenshot omitted; none of its fields identify the source input field)
Unfortunately the library doesn't send that information, so there is no existing config option you can use; it needs a code modification.
client.js:374
var _fileSelectCallback = function (event) {
    var files = event.target.files || event.dataTransfer.files;
    event.preventDefault();
    var source = event.target;
    _baseFileSelectCallback(files, source);
};
client.js:343
var _baseFileSelectCallback = function (files, source) {
    if (files.length === 0) return;
    // Ensure existence of meta property on each file
    for (var i = 0; i < files.length; i++) {
        if (source) {
            if (!files[i].meta) files[i].meta = {
                sourceElementId: source.id || "",
                sourceElementName: source.name || ""
            };
        } else {
            if (!files[i].meta) files[i].meta = {};
        }
    }
    // ... rest of the function unchanged
After these changes I am able to get the details in event.file.meta
I'm the author of socketio-file-upload.
It looks like the specific input field is not currently being recorded, but this would not be a hard feature to add. Someone opened a new issue and left a backpointer to this SO question.
A workaround would be to directly use submitFiles instead of listenOnArraySubmit. Something like this might work (untested):
// add a manual listener on your submit button
document.getElementById("save_level_3").addEventListener("click", () => {
    let index = 0;
    for (let element of array_files_lvl_3) {
        let files = element.files;
        for (let file of files) {
            file.meta = { index };
        }
        uploader.submitFiles(files);
        index++;
    }
});
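On the server, the meta object attached by this workaround then arrives with the upload events, so something like the following should identify the source input (a sketch matching the code above):

uploader.on('saved', function (evnt) {
    // meta is set client-side in the workaround above;
    // index says which element of array_files_lvl_3 the file came from
    var inputIndex = evnt.file.meta.index;
    console.log('saved', evnt.file.name, 'from input #' + inputIndex);
});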

Add a mongo request into a file and archive this file

I'm having some trouble trying to use streams with a MongoDB request. I want to:
Get the results from a collection
Put these results into a file
Compress this file into an archive
I'm using the archiver package for the compression. The file contains CSV-formatted values, so for each row I have to format it into a CSV line.
My function takes a res (output) parameter, which means I can send the result directly to a client. For the moment, I can put the results into a file without streams, but I think I'll run into memory trouble with a large amount of data, which is why I want to use streams.
Here is my code (with no stream)
function getCSV(res, query) {
    <dbRequest>.toArray(function (err, docs) {
        var csv = '';
        if (docs !== null) {
            for (var i = 0; i < docs.length; i++) {
                var line = '';
                for (var index in docs[i]) {
                    if (docs[i].hasOwnProperty(index) && (index !== '_id')) {
                        if (line !== '') line += ',';
                        line += docs[i][index];
                    }
                }
                console.log("line", line);
                csv += line += '\r\n';
            }
        }
    }.bind(this));
    fileManager.addToFile(csv);
    archiver.initialize();
    archiver.addToArchive(fileManager.getName());
    fileManager.deleteFile();
    archiver.sendToClient(res);
};
Once the CSV is complete, I add it to a file with a FileManager object. The latter handles file creation and manipulation. The addToArchive method adds the file to the current archive, and the sendToClient method sends the archive through the output (the res parameter of the function).
I'm using Express.js so I call this method with a server request.
Sometimes the file contains data, sometimes it is empty; could you explain why?
I'd like to understand how streams work and how I could implement them in my code.
Regards
I'm not quite sure why your data only sometimes shows up (one likely cause: toArray is asynchronous, so fileManager.addToFile(csv) runs before the callback has populated csv, and csv is also scoped inside the callback), but here is a way to send it with a stream. A couple of points of info before the code:
.stream({transform: someFunction})
takes a stream of documents from the database and runs whatever data manipulation you want on each document as it passes through the stream. I put this function into a closure to make it easier to keep the column headers, as well as allow you to pick and choose which keys from the document to use as columns. This will allow you to use it on different collections.
Here is the function that runs on each document as it passes through:
// this is a closure containing knowledge of the keys you want to use,
// as well as whether or not to add the headers before the current line
function createTransformFunction(keys) {
    var hasHeaders = false;
    // this is the function that is run on each document
    // as it passes through the stream
    return function (document) {
        var values = [];
        var line;
        keys.forEach(function (key) {
            // explicitly compare against undefined:
            // a falsy check like !document[key] would also replace the number 0
            if (document[key] !== undefined) {
                values.push(document[key]);
            } else {
                values.push("");
            }
        });
        // add the column headers only on the first document
        if (!hasHeaders) {
            line = keys.join(",") + "\r\n";
            line += values.join(",");
            hasHeaders = true;
        } else {
            // add the line break at the beginning of each line
            // to avoid having an extra line at the end
            line = "\r\n";
            line += values.join(",");
        }
        // return the line to the stream and move on to the next document
        return line;
    };
}
You pass that function into the transform option for the database stream. Now assuming you have a collection of people with the keys _id, firstName, lastName:
function (req, res) {
    // create a transform function with the keys you want to keep
    var transformPerson = createTransformFunction(["firstName", "lastName"]);
    // create the mongo read stream that uses your transform function
    var readStream = personCollection.find({}).stream({
        transform: transformPerson
    });
    // write stream to file
    var localWriteStream = fs.createWriteStream("./localFile.csv");
    readStream.pipe(localWriteStream);
    // write stream to download
    res.setHeader("content-type", "text/csv");
    res.setHeader("content-disposition", "attachment; filename=downloadFile.csv");
    readStream.pipe(res);
}
If you hit this endpoint, you'll trigger a download in the browser and write a local file. I didn't use archiver because I think it would add a level of complexity and take away from the concept of what's actually happening. The streams are all there, you'd just need to fiddle with it a bit to work it in with archiver.
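For what it's worth, wiring the same read stream into archiver could look roughly like this (a sketch, untested; it reuses readStream from the handler above and archiver's append/finalize API):

var archiver = require('archiver');

var archive = archiver('zip');
res.setHeader("content-type", "application/zip");
res.setHeader("content-disposition", "attachment; filename=download.zip");
// the archive itself is a stream, so pipe it to the response...
archive.pipe(res);
// ...add the CSV stream as a single entry inside the zip...
archive.append(readStream, { name: 'downloadFile.csv' });
// ...and finalize once all entries have been added
archive.finalize();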

Check if a document exists in mongoose (Node.js)

I have seen a number of ways of finding documents in mongoDB such that there is no performance hit, i.e. you don't really retrieve the document; instead you just retrieve a count of 1 or 0 if the document exists or not.
In mongoDB, one can probably do:
db.<collection>.find(...).limit(1).size()
In mongoose, whether you use callbacks or not, in both cases you are retrieving the entries rather than just checking the count. I simply want a way to check if a document exists in mongoose; I don't want the document per se.
EDIT: Now fiddling with the async API, I have the following code:
for (var i = 0; i < deamons.length; i++) {
    var deamon = deamons[i]; // get the deamon from the parsed XML source
    deamon = createDeamonDocument(deamon); // create a PSDeamon type document
    PSDeamon.count({deamonId: deamon.deamonId}, function (error, count) { // check if the document exists
        if (!error) {
            if (count == 0) {
                console.log('saving ' + deamon.deamonName);
                deamon.save(); // save
            } else {
                console.log('found ' + deamon.leagueName);
            }
        }
    });
}
You have to read about JavaScript scope: deamon is a single var-scoped variable shared by every callback, so by the time the asynchronous count callbacks run, it refers to the last element of the loop. Anyway, try the following code, which captures each element in its own scope:
for (var i = 0; i < deamons.length; i++) {
    (function (d) {
        var deamon = d;
        // create a PSDeamon type document
        PSDeamon.count({
            deamonId: deamon.deamonId
        }, function (error, count) { // check if the document exists
            if (!error) {
                if (count == 0) {
                    console.log('saving ' + deamon.deamonName);
                    // get the deamon from the parsed XML source
                    deamon = createDeamonDocument(deamon);
                    deamon.save(); // save
                } else {
                    console.log('found ' + deamon.leagueName);
                }
            }
        });
    })(deamons[i]);
}
Note: since it involves DB operations, I have not tested it.
I found it simpler this way.
let docExists = await Model.exists({key: value});
console.log(docExists);
Otherwise, if you use it inside a function, make sure the function is async.
let docHandler = async () => {
    let docExists = await Model.exists({key: value});
    console.log(docExists);
};
You can use count; it doesn't retrieve the entries. It relies on mongoDB's count operation, which:
Counts the number of documents in a collection.
Returns a document that contains this count as well as the command status.
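In newer Mongoose versions, count() is deprecated in favor of countDocuments(), which works the same way for this purpose (a sketch reusing PSDeamon from the question above; someId is a placeholder):

// countDocuments replaces the deprecated count() in newer Mongoose
const count = await PSDeamon.countDocuments({ deamonId: someId });
const exists = count > 0; // true if at least one matching document exists
console.log(exists);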
