trying to extract pdf data into json format with headers and paragraphs as nested child - node.js

I am trying to extract pdf data to json format like below using in adobe pdf extract api,
this is the json I am getting
I want it to be something like this
Tried lots of solution with nested approach but can't figure out a proper way.
I have tried a nested approach but it didn't give proper solution
here is code
`
const fs = require("fs");
// Read the JSON file
const jsonString = fs.readFileSync("structuredData.json", "utf8");
// Parse the JSON string into an object
const data = JSON.parse(jsonString);
// Initialize the output object
const output = {};
// Loop through each line in the data
data.elements.forEach((line) => {
// Get the path and text values
const path = line.Path;
const text = line.Text;
// Split the path into an array
const pathParts = path.split("/");
// Initialize the current object to be the output object
let current = output;
let prev = null;
// Loop through each part of the path
pathParts.forEach((part, index) => {
// If this is the last part of the path, set the text as the value
if (index === pathParts.length - 1) {
if (Object.keys(current)[0]) {
prevdata = current[Object.keys(current)[0]];
current[Object.keys(current)[0]] = { ...prevdata, [text]: {} };
} else [(current[text] = {})];
} else {
// If the current object doesn't have a property with this path part, create it
if (!current[part]) {
current[part] = {};
}
// Set the current object to be the nested object
current = current[part];
}
});
});
// Convert the output object to a JSON string and write it to a file
fs.writeFileSync("output.json", JSON.stringify(output));
`

Related

Reading file using Node.js "Invalid Encoding" Error

I am creating an application with Node.js and I am trying to read a file called "datalog.txt." I use the "append" function to write to the file:
//Appends buffer data to a given file
function append(filename, buffer) {
let fd = fs.openSync(filename, 'a+');
fs.writeSync(fd, str2ab(buffer));
fs.closeSync(fd);
}
//Converts string to buffer
function str2ab(str) {
var buf = new ArrayBuffer(str.length*2); // 2 bytes for each char
var bufView = new Uint16Array(buf);
for (var i=0, strLen=str.length; i < strLen; i++) {
bufView[i] = str.charCodeAt(i);
}
return buf;
}
append("datalog.txt","12345");
This seems to work great. However, now I want to use fs.readFileSync to read from the file. I tried using this:
const data = fs.readFileSync('datalog.txt', 'utf16le');
I changed the encoding parameter to all of the encoding types listed in the Node documentation, but all of them resulted in this error:
TypeError: Argument at index 2 is invalid: Invalid encoding
All I want to be able to do is be able to read the data from "datalog.txt." Any help would be greatly appreciated!
NOTE: Once I can read the data of the file, I want to be able to get a list of all the lines of the file.
Encoding and type are an object:
const data = fs.readFileSync('datalog.txt', {encoding:'utf16le'});
Okay, after a few hours of troubleshooting a looking at the docs I figured out a way to do this.
try {
// get metadata on the file (we need the file size)
let fileData = fs.statSync("datalog.txt");
// create ArrayBuffer to hold the file contents
let dataBuffer = new ArrayBuffer(fileData["size"]);
// read the contents of the file into the ArrayBuffer
fs.readSync(fs.openSync("datalog.txt", 'r'), dataBuffer, 0, fileData["size"], 0);
// convert the ArrayBuffer into a string
let data = String.fromCharCode.apply(null, new Uint16Array(dataBuffer));
// split the contents into lines
let dataLines = data.split(/\r?\n/);
// print out each line
dataLines.forEach((line) => {
console.log(line);
});
} catch (err) {
console.error(err);
}
Hope it helps someone else with the same problem!
This works for me:
index.js
const fs = require('fs');
// Write
fs.writeFileSync('./customfile.txt', 'Content_For_Writing');
// Read
const file_content = fs.readFileSync('./customfile.txt', {encoding:'utf8'}).toString();
console.log(file_content);
node index.js
Output:
Content_For_Writing
Process finished with exit code 0

How to search for files by extension and containing string within folder and subfolders?

Is it possible to look for files of given extension by the containing string provided?
What should my approach be? For example the input is txt and hello, and the output will be list of all files containing the string hello with extension txt.
You would write this in your terminal: node main.js hello
For a given directory it will search inside all subdirectories and all files for a text file with hello
Here is the code:
const { readdirSync, readFileSync, lstatSync } = require('fs');
const path = require('path');
const getDir = source => {
const results = readdirSync(source);
results.forEach(function (result) {
if (lstatSync(path.join(source, result))
.isFile()) {
if (readFileSync(path.join(source, result))
.includes(argument) && path.extname(result)
.toLowerCase() === extension) {
console.log("Your string is is in file: ", result)
}
}
else if (lstatSync(path.join(source, result))
.isDirectory()) {
getDir(path.join(source, result));
}
});
}
const dir = process.cwd();
const extension = '.txt'; //You can change the extension type here
let argument = process.argv[2];
getDir(dir);
You can use FileSystem to get the contents of a folder with the readdir method, this returns you an array of strings.
Let's say your working directory is a src file, in which there is a files folder that you want to read. Your script would look something like this:
const fs = require('fs');
var files = fs.readdir('./files', (err, filenames) => {
if (err) throw err;
return filenames;
})
You can then separate your string using string methods. Let's say you want to have two variables, fileNameand fileExt
You would use the String.split(separator) method, and use the . character as separator.
var files = // fs snippet above
for (file in files) {
let fileComponents = file.split('.');
let fileName = fileComponents[0];
let fileExt = fileComponents[1];
// You can run your code on the name, and extension of your file here.
}
This will not work for files containing multiple dots in their name. You will need extra work on the array to make sure fileName contains every string concatenated, separated by a . up until the final index of your fileComponents string

Write a line into a .txt file with Node.js

I want to use Node.js to create a simple logging system which prints a line before the past line into a .txt file. However, I don't know how the file system functionality from Node.js works.
Can someone explain it?
Inserting data into the middle of a text file is not a simple task. If possible, you should append it to the end of your file.
The easiest way to append data some text file is to use build-in fs.appendFile(filename, data[, options], callback) function from fs module:
var fs = require('fs')
fs.appendFile('log.txt', 'new data', function (err) {
if (err) {
// append failed
} else {
// done
}
})
But if you want to write data to log file several times, then it'll be best to use fs.createWriteStream(path[, options]) function instead:
var fs = require('fs')
var logger = fs.createWriteStream('log.txt', {
flags: 'a' // 'a' means appending (old data will be preserved)
})
logger.write('some data') // append string to your file
logger.write('more data') // again
logger.write('and more') // again
Node will keep appending new data to your file every time you'll call .write, until your application will be closed, or until you'll manually close the stream calling .end:
logger.end() // close string
Note that logger.write in the above example does not write to a new line. To write data to a new line:
var writeLine = (line) => logger.write(`\n${line}`);
writeLine('Data written to a new line');
Simply use fs module and something like this:
fs.appendFile('server.log', 'string to append', function (err) {
if (err) return console.log(err);
console.log('Appended!');
});
Step 1
If you have a small file
Read all the file data in to memory
Step 2
Convert file data string into Array
Step 3
Search the array to find a location where you want to insert the text
Step 4
Once you have the location insert your text
yourArray.splice(index,0,"new added test");
Step 5
convert your array to string
yourArray.join("");
Step 6
write your file like so
fs.createWriteStream(yourArray);
This is not advised if your file is too big
I created a log file which prints data into text file using "Winston" logger. The source code is here below,
const { createLogger, format, transports } = require('winston');
var fs = require('fs')
var logger = fs.createWriteStream('Data Log.txt', {
flags: 'a'
})
const os = require('os');
var sleep = require('system-sleep');
var endOfLine = require('os').EOL;
var t = ' ';
var s = ' ';
var q = ' ';
var array1=[];
var array2=[];
var array3=[];
var array4=[];
array1[0] = 78;
array1[1] = 56;
array1[2] = 24;
array1[3] = 34;
for (var n=0;n<4;n++)
{
array2[n]=array1[n].toString();
}
for (var k=0;k<4;k++)
{
array3[k]=Buffer.from(' ');
}
for (var a=0;a<4;a++)
{
array4[a]=Buffer.from(array2[a]);
}
for (m=0;m<4;m++)
{
array4[m].copy(array3[m],0);
}
logger.write('Date'+q);
logger.write('Time'+(q+' '))
logger.write('Data 01'+t);
logger.write('Data 02'+t);
logger.write('Data 03'+t);
logger.write('Data 04'+t)
logger.write(endOfLine);
logger.write(endOfLine);
function mydata() //user defined function
{
logger.write(datechar+s);
logger.write(timechar+s);
for ( n = 0; n < 4; n++)
{
logger.write(array3[n]);
}
logger.write(endOfLine);
}
var now = new Date();
var dateFormat = require('dateformat');
var date = dateFormat(now,"isoDate");
var time = dateFormat(now, "h:MM:ss TT ");
var datechar = date.toString();
var timechar = time.toString();
mydata();
sleep(5*1000);

How to use templating (handlebars, or any alternative) with Node.js and without using a framework (ex = express)?

For example, I have this JSON document "foo.json":
{
"foo": [
{
"bar": "Hello World!"
},
{
"bar": "The End"
}
]
}
In Node.js, I would like to use templating (handlebars or any) to generate a string from the JSON document, such as:
<p>Hello World!</p><p>The End</p>
... And then assign that string value to a variable in Node.js. Finally, I'll concatenate more values to the variable and output the final variable value as an html document.
Can this be done without using a framework like Express?
If you want to use handlebars, just grab the npm module:
npm install handlebars
Then in your script, you can use handlebars to render your output based on a simple template that iterates over the array foo and creates a <p> for each item, containing the text of the bar property:
var handlebars = require('handlebars');
// get your data into a variable
var fooJson = require('foo.json');
// set up your handlebars template
var source = '{{#each foo}}<p>{{this.bar}}</p>{{/each}}';
// compile the template
var template = handlebars.compile(source);
// call template as a function, passing in your data as the context
var outputString = template(fooJson);
If you want to use a .hbs template file instead of a string source you can use the fs module to read the file with fs.readFile, call toString() on the returned buffer, and use that to call a rendering function. Try this:
var handlebars = require('handlebars');
var fs = require('fs');
// get your data into a variable
var fooJson = require('path/to/foo.json');
// read the file and use the callback to render
fs.readFile('path/to/source.hbs', function(err, data){
if (!err) {
// make the buffer into a string
var source = data.toString();
// call the render function
renderToString(source, fooJson);
} else {
// handle file read error
}
});
// this will be called after the file is read
function renderToString(source, data) {
var template = handlebars.compile(source);
var outputString = template(data);
return outputString;
}

In Node.js, how to read a file, append a string at a specified line or delete a string from a certain line?

I need to open an existing JavaScript file, check if this string exists:
var LocalStrategy = require('passport-local').Strategy;
If it doesn't, then append it at the top with the rest of require() lines.
In another case, I need to check if that string exists, and if it does, I would like to remove just that line.
I have looked at fs.readFile, fs.writeFile, fs.open but I don't think it is capable of doing what I need. Any suggestions?
This is a simplified script:
var fs = require('fs');
var search = "var LocalStrategy = require('passport-local').Strategy;";
function append (line) {
line = line || 0;
var body = fs.readFileSync('example.js').toString();
if (body.indexOf(search) < 0 ) {
body = body.split('\n');
body.splice(line + 1,0,search);
body = body.filter(function(str){ return str; }); // remove empty lines
var output = body.join('\n');
fs.writeFileSync('example.js', output);
}
}
function remove () {
var body = fs.readFileSync('example.js').toString();
var idx = body.indexOf(search);
if (idx >= 0 ) {
var output = body.substr(0, idx) + body.substr(idx + search.length);
fs.writeFileSync('example.js', output);
}
}

Resources