How to extract text from docx file with Nodejs

How to extract text from docx file with Nodejs - node.js

i want to extract text from docx file, i have tried using mammoth
var mammoth = require("mammoth");
mammoth.extractRawText({path: "./doc.docx"})
.then(function(result){
var text = result.value; // The raw text
//this prints all the data of docx file
console.log(text);
for (var i = 0; i < text.length; i++) {
//this prints all the data char by char in separate lines
console.log(text[i]);
}
var messages = result.messages;
})
.done();
but the problem here is that in this for loop i want data line by line instead of char by char, please help me here or is there any other method that you know?

One method is to fetch the whole text and then split by '\n':
import superagent from 'superagent';
import mammoth from 'mammoth';
const url = 'http://www.ojk.ee/sites/default/files/respondus-docx-sample-file_0.docx';
const main = async () => {
const response = await superagent.get(url)
.parse(superagent.parse.image)
.buffer();
const buffer = response.body;
const text = (await mammoth.extractRawText({ buffer })).value;
const lines = text.split('\n');
console.log(lines);
};
main().catch(error => console.error(error));

You can use any-text
Usage is simeple:
var reader = require('any-text');
reader.getText(`path-to-file`).then(function (data) {
console.log(data);
});

var mammoth = require("mammoth");
var path = require("path");
var filePath = path.join(__dirname,'./doc.docx');
mammoth.extractRawText({path: filePath})
.then(function(result){
var text = result.value; // The raw text
//this prints all the data of docx file
//console.log(text);
console.log('------------------------------');
var textLines = text.split ("\n");
for (var i = 0; i < textLines.length; i++) {
//this prints all the data in separate lines
console.log(textLines[i]);
}
var messages = result.messages;
})
.done();

Related

How to read remove some lines from json file using node.js

I want to extract some comment lines of data from json file and after making those into certain format i need to remove those. I am explaining my json file below.
test.json:
#PATH:/test/
#DEVICES:div1
#TYPE:p1
{
name:'Raj',
address: {
city:'bbsr'
}
}
The above file is one test json file. Here I need to read the lines which includes # at beginning and then make them separate object like const obj ={PATH:'/test/',DEVICES:'div1',TYPE:p1} and finally remove those 3 line from original test.json file using node.js

Following snippet can be used to solve this problem. It can handle any number of similar comments.
const fs = require('fs')
const readline = require('readline');
const filepath = '/Users/.../nodeText.json'; // file to read from
const writePath = '/Users/.../nodeFileResult.json' // file to write to
const inputStream = fs.createReadStream(filepath)
const startLine = 'const obj = {\n';
let content = '';
const readPromise = new Promise(function(resolve, reject) {
const lineReader = readline.createInterface({
input: inputStream
});
inputStream.on('end', () => {
resolve('end');
});
lineReader.on('line', (line) => {
let resultText = '';
const replaceDict = {
'#': '"',
':': '": "',
'\n': '",\n'
};
const regexString = /#|:|\n/;
const re = new RegExp(regexString, 'gim');
if(line.startsWith('#')) {
resultText = (line+`\n`).replace(re, (match) => { // \n to make sure its line end and for regex to work
return replaceDict[match];
});
} else if(line.startsWith('{')) {
resultText = line.replace('{', `}, \n {`);
} else {
resultText = line
}
content = content + resultText;
});
});
readPromise
.then((resolve) => {
if(resolve === 'end') {
content = startLine + content
fs.writeFile(writePath,
content,
error => {
if (error) {
console.error(error)
return
}
})
}
})

How to download excel file from nodejs terminal

I am new to nodejs. Need your help. From the nodejs terminal, i want to download an excel file and convert it to csv (say, mocha online.js). Note: i don't want to do this via a browser.
Below is a script i am working on to download and convert to csv. There is no error nor the expected result:
online.js
if (typeof require !== 'undefined') XLSX = require('xlsx');
var XMLHttpRequest = require("xmlhttprequest").XMLHttpRequest;
/* set up XMLHttpRequest */
var url = "http://oss.sheetjs.com/js-xlsx/test_files/formula_stress_test_ajax.xlsx";
var xhr = new XMLHttpRequest();
xhr.open("GET", url, true);
xhr.responseType = "arraybuffer";
describe('suite', function () {
it('case', function () {
var arraybuffer = xhr.response;
/* convert data to binary string */
var data = new Uint8Array(arraybuffer);
var arr = new Array();
for (var i = 0; i != data.length; ++i) arr[i] = String.fromCharCode(data[i]);
var bstr = arr.join("");
/* Call XLSX */
var sheetName = 'Database';
var workbook = XLSX.read(bstr, { type: "binary" });
var worksheet = workbook.Sheets[sheetName];
var csv = XLSX.utils.sheet_to_csv(worksheet);
console.log(csv);
xhr.send();
//.... perform validations here using the csv data
});
})}

I tried myself with this code, and it seems it is working, the only thing is that I spent 15 minutes trying to understand why my open office would not open the file, I eventually understood that they were sending a zip file ... here is the full code, the doc of the http get function is here http.get
You could have used the request module, but it isn't native, request is easier though.
enjoy!
const url = 'http://oss.sheetjs.com/js-xlsx/test_files/formula_stress_test_ajax.xlsx'
const http = require('http')
const fs = require('fs')
http.get(url, (res) => {
debugger
const {
statusCode
} = res;
const contentType = res.headers['content-type'];
console.log(`The type of the file is : ${contentType}`)
let error;
if (statusCode !== 200) {
error = new Error(`Request Failed.\n` +
`Status Code: ${statusCode}`);
}
if (error) {
console.error(error.message);
// consume response data to free up memory
res.resume();
return;
}
res.setEncoding('binary');
let rawData = '';
res.on('data', (chunk) => {
rawData += chunk;
});
res.on('end', () => {
try {
const parsedData = xlsxToCSVFunction(rawData);
// And / Or just put it in a file
fs.writeFileSync('fileName.zip', rawData, 'binary')
// console.log(parsedData);
} catch (e) {
console.error(e.message);
}
});
}).on('error', (e) => {
console.error(`Got error: ${e.message}`);
});
function xlsxToCSVFunction(rawData) {
return rawData //you should return the csv file here whatever your tools are
}

I actually encountered the same problem 3 months ago : here is what I did!
I did not find any nodeJS module that was exactly as I wanted, so I used in2csv (a python shell program) to transform the data; the t option is to use tabulation as the delimiter
1) Step 1: transforming the xlsx file into csv using in2csv
This code takes all the xlsx files in the current directory, transform them into csv files and put them in another directory
var shelljs = require('shelljs/global')
var dir = pwd().stdout.split('/')
dir = dir[dir.length - 1].replace(/\s/g, '\\ ')
mkdir('../'+ dir + 'CSV')
ls('*.xlsx').forEach(function(file) {
// below are the two lines you need
let string = 'in2csv -t ' + file.replace(/\s/g, '\\ ') + ' > ../'+ dir + 'CSV/' + file.replace('xlsx','csv').replace(/\s/g, '\\ ')
exec(string, {silent:true}, function(code, stdout, stderr){
console.log('new file : ' + file.replace('xlsx','csv'))
if(stderr){
console.log(string)
console.log('Program stderr:', stderr)
}
})
});
Step 2: loading the data in a nodejs program:
my script is very long but the main two lines are :
const args = fileContent.split('\n')[0].split(',')
const content = fileContent.split('\n').slice(1).map(e => e.split(','))

And for the benefit of seekers like me...here is a solution using mocha, request and xlsx
var request = require('request');
var XLSX = require('xlsx');
describe('suite', function () {
it('case', function (done) {
var url = "http://oss.sheetjs.com/js-xlsx/test_files/formula_stress_test_ajax.xlsx";
var options = {
url: url,
headers: {
'Content-Type': 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
},
encoding: null
};
request.get(options, function (err, res, body){
var arraybuffer = body;
/* convert data to binary string */
var data = arraybuffer;
//var data = new Uint8Array(arraybuffer);
var arr = new Array();
for (var i = 0; i != data.length; ++i) arr[i] = String.fromCharCode(data[i]);
var bstr = arr.join("");
/* Call XLSX */
var sheetName = 'Database';
var workbook = XLSX.read(bstr, { type: "binary" });
var worksheet = workbook.Sheets[sheetName];
var csv = XLSX.utils.sheet_to_csv(worksheet);
console.log(csv);
done();
});
});
});

Read csv files in stream and store them in database

I have a few huge csv files, what I need to store in a mongo database. Because these files are too big, I need to use stream. I pause the stream while the data writing into the database.
var fs = require('fs');
var csv = require('csv');
var mongo = require('mongodb');
var db = mongo.MongoClient.connect...
var readStream = fs.createReadStream('hugefile.csv');
readStream.on('data', function(data) {
readStream.pause();
csv.parse(data.toString(), { delimiter: ','}, function(err, output) {
db.collection(coll).insert(data, function(err) {
readStream.resume();
});
});
});
readStream.on('end', function() {
logger.info('file stored');
});
But the csv.parse drop an error, because I would need to read the files line by line to handle them as csv, and convert to json for the mongodb. Maybe I should not pause them, but use an interface. I didn't find any solution for this yet.
Any help would be appreciated!

I think you might want to create a stream of lines from your raw data stream.
Here is an example from the split package. https://www.npmjs.com/package/split
fs.createReadStream(file)
.pipe(split())
.on('data', function (line) {
//each chunk now is a seperate line!
})
Adapted to your example it might look like this
var readStream = fs.createReadStream('hugefile.csv');
var lineStream = readStream.pipe(split());
lineStream.on('data', function(data) {
//remaining code unmodified

I'm unsure if bulk() was a thing back in '15, but whosoever is trying to import items from large sources should consider using them.
var fs = require('fs');
var csv = require('fast-csv');
var mongoose = require('mongoose');
var db = mongoose.connect...
var counter = 0; // to keep count of values in the bulk()
const BULK_SIZE = 1000;
var bulkItem = Item.collection.initializeUnorderedBulkOp();
var readStream = fs.createReadStream('hugefile.csv');
const csvStream = csv.fromStream(readStream, { headers: true });
csvStream.on('data', data => {
counter++;
bulkOrder.insert(order);
if (counter === BATCH_SIZE) {
csvStream.pause();
bulkOrder.execute((err, result) => {
if (err) console.log(err);
counter = 0;
bulkItem = Item.collection.initializeUnorderedBulkOp();
csvStream.resume();
});
}
}
});

Node.js How to delete first line in file

I'm making simple Node.js app and I need to delete first line in file. Please is any way how to do it? I think that It will be possible with fs.write, but how?

Here is streamed version of removing first line from file.
As it uses streams, means you don't need to load whole file in memory, so it is way more efficient and fast, as well can work on very large files without filling memory on your hardware.
var Transform = require('stream').Transform;
var util = require('util');
// Transform sctreamer to remove first line
function RemoveFirstLine(args) {
if (! (this instanceof RemoveFirstLine)) {
return new RemoveFirstLine(args);
}
Transform.call(this, args);
this._buff = '';
this._removed = false;
}
util.inherits(RemoveFirstLine, Transform);
RemoveFirstLine.prototype._transform = function(chunk, encoding, done) {
if (this._removed) { // if already removed
this.push(chunk); // just push through buffer
} else {
// collect string into buffer
this._buff += chunk.toString();
// check if string has newline symbol
if (this._buff.indexOf('\n') !== -1) {
// push to stream skipping first line
this.push(this._buff.slice(this._buff.indexOf('\n') + 2));
// clear string buffer
this._buff = null;
// mark as removed
this._removed = true;
}
}
done();
};
And use it like so:
var fs = require('fs');
var input = fs.createReadStream('test.txt'); // read file
var output = fs.createWriteStream('test_.txt'); // write file
input // take input
.pipe(RemoveFirstLine()) // pipe through line remover
.pipe(output); // save to file
Another way, which is not recommended.
If your files are not large, and you don't mind loading them into memory, load file, remove line, save file, but it is slower and wont work well on large files.
var fs = require('fs');
var filePath = './test.txt'; // path to file
fs.readFile(filePath, function(err, data) { // read file to memory
if (!err) {
data = data.toString(); // stringify buffer
var position = data.toString().indexOf('\n'); // find position of new line element
if (position != -1) { // if new line element found
data = data.substr(position + 1); // subtract string based on first line length
fs.writeFile(filePath, data, function(err) { // write file
if (err) { // if error, report
console.log (err);
}
});
} else {
console.log('no lines found');
}
} else {
console.log(err);
}
});

Here is another way:
const fs = require('fs');
const filePath = './table.csv';
let csvContent = fs.readFileSync(filePath).toString().split('\n'); // read file and convert to array by line break
csvContent.shift(); // remove the the first element from array
csvContent = csvContent.join('\n'); // convert array back to string
fs.writeFileSync(filePath, csvContent);

Thanks to #Lilleman 's comment, I've made an amendment to the original solution, which requires a 3rd-party module "line-by-line" and can prevent memory overflow and racing condition while processing very large file.
const fs = require('fs');
const LineReader = require('line-by-line');
const removeLines = function(srcPath, destPath, count, cb) {
if(count <= 0) {
return cb();
}
var reader = new LineReader(srcPath);
var output = fs.createWriteStream(destPath);
var linesRemoved = 0;
var isFirstLine = true;
reader.on('line', (line) => {
if(linesRemoved < count) {
linesRemoved++;
return;
}
reader.pause();
var newLine;
if(isFirstLine) {
newLine = line;
isFirstLine = false;
} else {
newLine = '\n' + line;
}
output.write(newLine, () => {
reader.resume();
});
})
.on('error', (err) => {
reader.pause();
return cb(err);
})
.on('close', () => {
return cb();
})
}
---------------- original solution below---------------
Inspired by another answer, here is a revised stream version:
const fs = require('fs');
const readline = require('readline');
const removeFirstLine = function(srcPath, destPath, done) {
var rl = readline.createInterface({
input: fs.createReadStream(srcPath)
});
var output = fs.createWriteStream(destPath);
var firstRemoved = false;
rl.on('line', (line) => {
if(!firstRemoved) {
firstRemoved = true;
return;
}
output.write(line + '\n');
}).on('close', () => {
return done();
})
}
and it can be easily modified to remove certain amount of lines, by changing the 'firstRemoved' into a counter:
var linesRemoved = 0;
...
if(linesRemoved < LINES_TO_BE_REMOVED) {
linesRemoved++;
return;
}
...

Here is a naive solution using the Promise-based file system APIs.
const fs = require('node:fs/promises')
const os = require('node:os')
async function removeLines(path, numLinesToRemove) {
const data = await fs.readFile(path, { encoding: 'utf-8' })
const newData = data
.split(os.EOL) // split data into array of strings
.slice(numLinesToRemove) // remove first N lines of array
.join(os.EOL) // join array into a single string
// overwrite original file with new data
return fs.writeFile(path, newData)
}

Parsing huge logfiles in Node.js - read in line-by-line

I need to do some parsing of large (5-10 Gb)logfiles in Javascript/Node.js (I'm using Cube).
The logline looks something like:
10:00:43.343423 I'm a friendly log message. There are 5 cats, and 7 dogs. We are in state "SUCCESS".
We need to read each line, do some parsing (e.g. strip out 5, 7 and SUCCESS), then pump this data into Cube (https://github.com/square/cube) using their JS client.
Firstly, what is the canonical way in Node to read in a file, line by line?
It seems to be fairly common question online:
http://www.quora.com/What-is-the-best-way-to-read-a-file-line-by-line-in-node-js
Read a file one line at a time in node.js?
A lot of the answers seem to point to a bunch of third-party modules:
https://github.com/nickewing/line-reader
https://github.com/jahewson/node-byline
https://github.com/pkrumins/node-lazy
https://github.com/Gagle/Node-BufferedReader
However, this seems like a fairly basic task - surely, there's a simple way within the stdlib to read in a textfile, line-by-line?
Secondly, I then need to process each line (e.g. convert the timestamp into a Date object, and extract useful fields).
What's the best way to do this, maximising throughput? Is there some way that won't block on either reading in each line, or on sending it to Cube?
Thirdly - I'm guessing using string splits, and the JS equivalent of contains (IndexOf != -1?) will be a lot faster than regexes? Has anybody had much experience in parsing massive amounts of text data in Node.js?

I searched for a solution to parse very large files (gbs) line by line using a stream. All the third-party libraries and examples did not suit my needs since they processed the files not line by line (like 1 , 2 , 3 , 4 ..) or read the entire file to memory
The following solution can parse very large files, line by line using stream & pipe. For testing I used a 2.1 gb file with 17.000.000 records. Ram usage did not exceed 60 mb.
First, install the event-stream package:
npm install event-stream
Then:
var fs = require('fs')
, es = require('event-stream');
var lineNr = 0;
var s = fs.createReadStream('very-large-file.csv')
.pipe(es.split())
.pipe(es.mapSync(function(line){
// pause the readstream
s.pause();
lineNr += 1;
// process line here and call s.resume() when rdy
// function below was for logging memory usage
logMemoryUsage(lineNr);
// resume the readstream, possibly from a callback
s.resume();
})
.on('error', function(err){
console.log('Error while reading file.', err);
})
.on('end', function(){
console.log('Read entire file.')
})
);
Please let me know how it goes!

You can use the inbuilt readline package, see docs here. I use stream to create a new output stream.
var fs = require('fs'),
readline = require('readline'),
stream = require('stream');
var instream = fs.createReadStream('/path/to/file');
var outstream = new stream;
outstream.readable = true;
outstream.writable = true;
var rl = readline.createInterface({
input: instream,
output: outstream,
terminal: false
});
rl.on('line', function(line) {
console.log(line);
//Do your stuff ...
//Then write to output stream
rl.write(line);
});
Large files will take some time to process. Do tell if it works.

I really liked #gerard answer which is actually deserves to be the correct answer here. I made some improvements:
Code is in a class (modular)
Parsing is included
Ability to resume is given to the outside in case there is an asynchronous job is chained to reading the CSV like inserting to DB, or a HTTP request
Reading in chunks/batche sizes that
user can declare. I took care of encoding in the stream too, in case
you have files in different encoding.
Here's the code:
'use strict'
const fs = require('fs'),
util = require('util'),
stream = require('stream'),
es = require('event-stream'),
parse = require("csv-parse"),
iconv = require('iconv-lite');
class CSVReader {
constructor(filename, batchSize, columns) {
this.reader = fs.createReadStream(filename).pipe(iconv.decodeStream('utf8'))
this.batchSize = batchSize || 1000
this.lineNumber = 0
this.data = []
this.parseOptions = {delimiter: '\t', columns: true, escape: '/', relax: true}
}
read(callback) {
this.reader
.pipe(es.split())
.pipe(es.mapSync(line => {
++this.lineNumber
parse(line, this.parseOptions, (err, d) => {
this.data.push(d[0])
})
if (this.lineNumber % this.batchSize === 0) {
callback(this.data)
}
})
.on('error', function(){
console.log('Error while reading file.')
})
.on('end', function(){
console.log('Read entirefile.')
}))
}
continue () {
this.data = []
this.reader.resume()
}
}
module.exports = CSVReader
So basically, here is how you will use it:
let reader = CSVReader('path_to_file.csv')
reader.read(() => reader.continue())
I tested this with a 35GB CSV file and it worked for me and that's why I chose to build it on #gerard's answer, feedbacks are welcomed.

I used https://www.npmjs.com/package/line-by-line for reading more than 1 000 000 lines from a text file. In this case, an occupied capacity of RAM was about 50-60 megabyte.
const LineByLineReader = require('line-by-line'),
lr = new LineByLineReader('big_file.txt');
lr.on('error', function (err) {
// 'err' contains error object
});
lr.on('line', function (line) {
// pause emitting of lines...
lr.pause();
// ...do your asynchronous line processing..
setTimeout(function () {
// ...and continue emitting lines.
lr.resume();
}, 100);
});
lr.on('end', function () {
// All lines are read, file is closed now.
});

The Node.js Documentation offers a very elegant example using the Readline module.
Example: Read File Stream Line-by-Line
const { once } = require('node:events');
const fs = require('fs');
const readline = require('readline');
const rl = readline.createInterface({
input: fs.createReadStream('sample.txt'),
crlfDelay: Infinity
});
rl.on('line', (line) => {
console.log(`Line from file: ${line}`);
});
await once(rl, 'close');
Note: we use the crlfDelay option to recognize all instances of CR LF ('\r\n') as a single line break.

Apart from read the big file line by line, you also can read it chunk by chunk. For more refer to this article
var offset = 0;
var chunkSize = 2048;
var chunkBuffer = new Buffer(chunkSize);
var fp = fs.openSync('filepath', 'r');
var bytesRead = 0;
while(bytesRead = fs.readSync(fp, chunkBuffer, 0, chunkSize, offset)) {
offset += bytesRead;
var str = chunkBuffer.slice(0, bytesRead).toString();
var arr = str.split('\n');
if(bytesRead = chunkSize) {
// the last item of the arr may be not a full line, leave it to the next chunk
offset -= arr.pop().length;
}
lines.push(arr);
}
console.log(lines);

I had the same problem yet. After comparing several modules that seem to have this feature, I decided to do it myself, it's simpler than I thought.
gist: https://gist.github.com/deemstone/8279565
var fetchBlock = lineByline(filepath, onEnd);
fetchBlock(function(lines, start){ ... }); //lines{array} start{int} lines[0] No.
It cover the file opened in a closure, that fetchBlock() returned will fetch a block from the file, end split to array (will deal the segment from last fetch).
I've set the block size to 1024 for each read operation. This may have bugs, but code logic is obvious, try it yourself.

Reading / Writing files using stream with the native nodejs modules (fs, readline):
const fs = require('fs');
const readline = require('readline');
const rl = readline.createInterface({
input: fs.createReadStream('input.json'),
output: fs.createWriteStream('output.json')
});
rl.on('line', function(line) {
console.log(line);
// Do any 'line' processing if you want and then write to the output file
this.output.write(`${line}\n`);
});
rl.on('close', function() {
console.log(`Created "${this.output.path}"`);
});

Based on this questions answer I implemented a class you can use to read a file synchronously line-by-line with fs.readSync(). You can make this "pause" and "resume" by using a Q promise (jQuery seems to require a DOM so cant run it with nodejs):
var fs = require('fs');
var Q = require('q');
var lr = new LineReader(filenameToLoad);
lr.open();
var promise;
workOnLine = function () {
var line = lr.readNextLine();
promise = complexLineTransformation(line).then(
function() {console.log('ok');workOnLine();},
function() {console.log('error');}
);
}
workOnLine();
complexLineTransformation = function (line) {
var deferred = Q.defer();
// ... async call goes here, in callback: deferred.resolve('done ok'); or deferred.reject(new Error(error));
return deferred.promise;
}
function LineReader (filename) {
this.moreLinesAvailable = true;
this.fd = undefined;
this.bufferSize = 1024*1024;
this.buffer = new Buffer(this.bufferSize);
this.leftOver = '';
this.read = undefined;
this.idxStart = undefined;
this.idx = undefined;
this.lineNumber = 0;
this._bundleOfLines = [];
this.open = function() {
this.fd = fs.openSync(filename, 'r');
};
this.readNextLine = function () {
if (this._bundleOfLines.length === 0) {
this._readNextBundleOfLines();
}
this.lineNumber++;
var lineToReturn = this._bundleOfLines[0];
this._bundleOfLines.splice(0, 1); // remove first element (pos, howmany)
return lineToReturn;
};
this.getLineNumber = function() {
return this.lineNumber;
};
this._readNextBundleOfLines = function() {
var line = "";
while ((this.read = fs.readSync(this.fd, this.buffer, 0, this.bufferSize, null)) !== 0) { // read next bytes until end of file
this.leftOver += this.buffer.toString('utf8', 0, this.read); // append to leftOver
this.idxStart = 0
while ((this.idx = this.leftOver.indexOf("\n", this.idxStart)) !== -1) { // as long as there is a newline-char in leftOver
line = this.leftOver.substring(this.idxStart, this.idx);
this._bundleOfLines.push(line);
this.idxStart = this.idx + 1;
}
this.leftOver = this.leftOver.substring(this.idxStart);
if (line !== "") {
break;
}
}
};
}

node-byline uses streams, so i would prefer that one for your huge files.
for your date-conversions i would use moment.js.
for maximising your throughput you could think about using a software-cluster. there are some nice-modules which wrap the node-native cluster-module quite well. i like cluster-master from isaacs. e.g. you could create a cluster of x workers which all compute a file.
for benchmarking splits vs regexes use benchmark.js. i havent tested it until now. benchmark.js is available as a node-module

import * as csv from 'fast-csv';
import * as fs from 'fs';
interface Row {
[s: string]: string;
}
type RowCallBack = (data: Row, index: number) => object;
export class CSVReader {
protected file: string;
protected csvOptions = {
delimiter: ',',
headers: true,
ignoreEmpty: true,
trim: true
};
constructor(file: string, csvOptions = {}) {
if (!fs.existsSync(file)) {
throw new Error(`File ${file} not found.`);
}
this.file = file;
this.csvOptions = Object.assign({}, this.csvOptions, csvOptions);
}
public read(callback: RowCallBack): Promise < Array < object >> {
return new Promise < Array < object >> (resolve => {
const readStream = fs.createReadStream(this.file);
const results: Array < any > = [];
let index = 0;
const csvStream = csv.parse(this.csvOptions).on('data', async (data: Row) => {
index++;
results.push(await callback(data, index));
}).on('error', (err: Error) => {
console.error(err.message);
throw err;
}).on('end', () => {
resolve(results);
});
readStream.pipe(csvStream);
});
}
}
import { CSVReader } from '../src/helpers/CSVReader';
(async () => {
const reader = new CSVReader('./database/migrations/csv/users.csv');
const users = await reader.read(async data => {
return {
username: data.username,
name: data.name,
email: data.email,
cellPhone: data.cell_phone,
homePhone: data.home_phone,
roleId: data.role_id,
description: data.description,
state: data.state,
};
});
console.log(users);
})();

I have made a node module to read large file asynchronously text or JSON.
Tested on large files.
var fs = require('fs')
, util = require('util')
, stream = require('stream')
, es = require('event-stream');
module.exports = FileReader;
function FileReader(){
}
FileReader.prototype.read = function(pathToFile, callback){
var returnTxt = '';
var s = fs.createReadStream(pathToFile)
.pipe(es.split())
.pipe(es.mapSync(function(line){
// pause the readstream
s.pause();
//console.log('reading line: '+line);
returnTxt += line;
// resume the readstream, possibly from a callback
s.resume();
})
.on('error', function(){
console.log('Error while reading file.');
})
.on('end', function(){
console.log('Read entire file.');
callback(returnTxt);
})
);
};
FileReader.prototype.readJSON = function(pathToFile, callback){
try{
this.read(pathToFile, function(txt){callback(JSON.parse(txt));});
}
catch(err){
throw new Error('json file is not valid! '+err.stack);
}
};
Just save the file as file-reader.js, and use it like this:
var FileReader = require('./file-reader');
var fileReader = new FileReader();
fileReader.readJSON(__dirname + '/largeFile.json', function(jsonObj){/*callback logic here*/});

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

How to extract text from docx file with Nodejs - node.js

You can use any-text Usage is simeple: var reader = require('any-text'); reader.getText(`path-to-file`).then(function (data) { console.log(data); });

Related

How to read remove some lines from json file using node.js

How to download excel file from nodejs terminal

Read csv files in stream and store them in database

Node.js How to delete first line in file

Parsing huge logfiles in Node.js - read in line-by-line

Categories

Resources