Node.js Script Not Reading GDOC File Extension

I am using the Google Drive for Developers Drive API (v3) Node.js quickstart.
In particular I am concentrating on the following function, where I have customized pageSize to 1 for testing, and I am calling my function read(file.name):
/**
 * Lists the names and IDs of up to 10 files.
 * @param {google.auth.OAuth2} auth An authorized OAuth2 client.
 */
function listFiles(auth) {
  const drive = google.drive({version: 'v3', auth});
  drive.files.list({
    pageSize: 1, // only find the last modified file in dev folder
    fields: 'nextPageToken, files(id, name)',
  }, (err, res) => {
    if (err) return console.log('The API returned an error: ' + err);
    const files = res.data.files;
    if (files.length) {
      console.log('Files:');
      files.map((file) => {
        console.log(`${file.name} (${file.id})`);
        read(file.name); // my function here
      });
    } else {
      console.log('No files found.');
    }
  });
}
// custom code - function to read and output file contents
function read(fileName) {
  const readableStream = fs.createReadStream(fileName, 'utf8');
  readableStream.on('error', function (error) {
    console.log(`error: ${error.message}`);
  });
  readableStream.on('data', (chunk) => {
    console.log(chunk);
  });
}
This code reads the file from the Google Drive folder that is synced locally. I am using this local folder for development. I have found that the pageSize: 1 parameter produces the most recently modified file in this folder. Therefore my process has been:
Edit .js code file
Make minor edit on testfiles (first txt then gdoc) to ensure it is last modified
Run the code
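(A side note, and an assumption on my part: as far as I know the Drive API does not guarantee any particular ordering for files.list, so relying on pageSize: 1 to return the most recently modified file may be fragile. Passing the orderBy parameter makes that intent explicit, e.g.:)
drive.files.list({
  pageSize: 1,
  orderBy: 'modifiedTime desc', // explicitly request the most recently modified file first
  fields: 'nextPageToken, files(id, name)',
}, callback); // callback as in the quickstart above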
I am testing a text file against a GDOC file. The filenames are atest.txt & 31832_226114__0001-00028.gdoc respectively. The outputs are as follows:
PS C:\Users\david\Google Drive\Technical-local\gDriveDev> node .\gdocToTextDownload.js
Files:
atest.txt (1bm1E4s4ET6HVTrJUj4TmNGaxqJJRcnCC)
atest.txt this is a test file!!
PS C:\Users\david\Google Drive\Technical-local\gDriveDev> node .\gdocToTextDownload.js
Files:
31832_226114__0001-00028 (1oi_hE0TTfsKG9lr8Wl7ahGNvMvXJoFj70LssGNFFjOg)
error: ENOENT: no such file or directory, open 'C:\Users\david\Google Drive\Technical-local\gDriveDev\31832_226114__0001-00028'
My question is:
Why does the script read the text file but not the gdoc?
At this point I must 'hard code' the gdoc file extension onto the file name in the function call to produce the required output, as in the text file example, e.g.
read('31832_226114__0001-00028.gdoc');
Which is obviously not what I want to do.
I am aiming to produce a script that will download a large number of gdocs that have been created from .jpg files.
------------------------- code completed below ------------------------
/**
 * Lists the names and IDs of pageSize number of files (using query to define folder of files)
 * @param {google.auth.OAuth2} auth An authorized OAuth2 client.
 */
function listFiles(auth) {
  const drive = google.drive({version: 'v3', auth});
  drive.files.list({
    corpora: 'user',
    pageSize: 100,
    // files in a parent folder that have not been trashed
    // get ID from Drive > Folder by looking at the URL after /folders/
    q: `'11Sejh6XG-2WzycpcC-MaEmDQJc78LCFg' in parents and trashed=false`,
    fields: 'nextPageToken, files(id, name)',
  }, (err, res) => {
    if (err) return console.log('The API returned an error: ' + err);
    const files = res.data.files;
    if (files.length) {
      const ids = [];
      const names = [];
      files.forEach(function(file, i) {
        ids.push(file.id);
        names.push(file.name);
      });
      ids.forEach((fileId, i) => {
        const fileName = names[i];
        downloadFile(drive, fileId, fileName);
      });
    } else {
      console.log('No files found.');
    }
  });
}
/**
 * @param {google.auth.OAuth2} auth An authorized OAuth2 client.
 */
function downloadFile(drive, fileId, fileName) {
  // make sure you have valid path & permissions. Use UNIX filepath notation.
  const filePath = `/test/test1/${fileName}`;
  const dest = fs.createWriteStream(filePath);
  let progress = 0;
  drive.files.export(
    { fileId, mimeType: 'text/plain' },
    { responseType: 'stream' }
  ).then(res => {
    res.data
      .on('end', () => {
        console.log(' Done downloading');
      })
      .on('error', err => {
        console.error('Error downloading file.');
      })
      .on('data', d => {
        progress += d.length;
        if (process.stdout.isTTY) {
          process.stdout.clearLine();
          process.stdout.cursorTo(0);
          process.stdout.write(`Downloading ${fileName} ${progress} bytes`);
        }
      })
      .pipe(dest);
  });
}

My question is: Why does the script read the text file but not the gdoc?
This is because you're trying to download a Google Workspace document; only files with binary content can be downloaded using the drive.files.get method. For Google Workspace documents you need to use drive.files.export, as documented here.
From your code, I see you're only listing the files. You will need to identify the type of file you want to download; you can use the mimeType field to check whether you need the export method or the get method. For example, a Google Doc's MIME type is application/vnd.google-apps.document, while a docx file (binary) is application/vnd.openxmlformats-officedocument.wordprocessingml.document.
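For instance, a minimal sketch of that branching, assuming a googleapis drive client like the one in your script (the text/plain export target is just an example choice):
// Decide between export (Workspace documents) and get (binary content).
async function download(drive, file) {
  if (file.mimeType === 'application/vnd.google-apps.document') {
    // Google Doc: must be exported to a concrete format, e.g. plain text
    return drive.files.export(
      { fileId: file.id, mimeType: 'text/plain' },
      { responseType: 'stream' }
    );
  }
  // Binary content: can be downloaded directly
  return drive.files.get(
    { fileId: file.id, alt: 'media' },
    { responseType: 'stream' }
  );
}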
Check the following working example:
const fs = require("fs");

const getFile = async (drive, fileId, name) => {
  const res = await drive.files.get({ fileId, alt: "media" }, { responseType: "stream" });
  return new Promise((resolve, reject) => {
    const filePath = `/tmp/${name}`;
    console.log(`writing to ${filePath}`);
    const dest = fs.createWriteStream(filePath);
    let progress = 0;
    res.data
      .on("end", () => {
        console.log("🎉 Done downloading file.");
        resolve(filePath);
      })
      .on("error", (err) => {
        console.error("🚫 Error downloading file.");
        reject(err);
      })
      .on("data", (d) => {
        progress += d.length;
        console.log(`🕛 Downloaded ${progress} bytes`);
      })
      .pipe(dest);
  });
};

const fileKind = "drive#file";
let filesCounter = 0;
const drive = googleClient.drive({ version: "v3" });
const files = await drive.files.list();
// Only files with binary content can be downloaded. Use Export with Docs Editors files.
// Read more at https://developers.google.com/drive/api/v3/reference/files/get
// In this example, any docx file will be downloaded into a temp folder.
const onlyFiles = files.data.files.filter(
  (file) =>
    file.kind === fileKind &&
    file.mimeType === "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
);
const numberOfFilesToDownload = onlyFiles.length;
console.log(`😏 About to download ${numberOfFilesToDownload} files`);
for await (const file of onlyFiles) {
  filesCounter++;
  console.log(`📁 Downloading file ${file.name}, ${filesCounter} of ${numberOfFilesToDownload}`);
  await getFile(drive, file.id, file.name);
}

The answer (as I see it) is that the Node.js script above is running on Windows and therefore must comply with the native file system inherited from the DOS/NT development of Windows. The gdoc extension, on the other hand, is a reference created by the Google Drive sync desktop client, and here is the important distinction: the gdoc extension references a file stored on Google Drive (the reference lives in the sync folder on the hard drive, while the file itself is in the cloud on Google Drive). It is therefore not an extension in the usual sense, where the extension marks a file type a local application can validly open, read, and write. So my test function above, function read(fileName), won't be able to read the .gdoc the same way it reads the .txt.
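(For illustration, and as far as I can tell from the sync clients I have seen, a .gdoc is just a small JSON pointer file of roughly this shape; the exact fields may vary by client version:)
{"url": "https://docs.google.com/document/d/<doc_id>/edit", "doc_id": "<doc_id>"}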
Therefore the correct way to access files on Google Drive from a local application is to use the file's ID. The filename is just a convenient way of labelling the IDs so that the user can meaningfully compare the downloaded copy of the file with the original on Google Drive.
(Refer to the original question.) Using the code under the '---------- code completed below ---------' divider, I have added these two functions to Google's Node.js quickstart, replacing the function listFiles(auth) and adding function downloadFile(drive, fileId, fileName).
The complete script file has been used to download multiple files (more than 50 at a time) to my hard drive. It is a useful piece of code in an OCR setup in which a gscript converts .JPG images of historic electoral rolls into readable text. The resulting gdocs are messy (still containing the original image, and colored fonts of various formats). Downloading them as text files with the above script cleans them up: images are of course removed, and the fonts are standardized to plain upper/lower case text. So it's more than just a downloader; it's a filter as well.
I hope this is of some use to someone.

Related

How to pipe a file during download with Puppeteer?

Is it possible to pipe a file during download with Puppeteer?
The code attached is an example of a download with Puppeteer, while the second part shows how I extract a file during a (non-Puppeteer) download.
I want to include part 2 in part 1 somehow.
const page = await browser.newPage(); // skipped other configs
const client = await page.target().createCDPSession(); // set directory for downloaded files
await client.send("Page.setDownloadBehavior", {
  behavior: "allow",
  downloadPath: process.cwd() + "\\src\\tempDataFiles\\rami",
});
// array of links from page
const fileUrlArray = await page.$$eval("selector", (files) => {
  return files.map((link) => link.getAttribute("href"));
});
// download files
const filtredFiles = fileUrlArray.filter((url) => url !== null);
for (const file of filtredFiles) {
  await page.click(`[href="${file}"]`);
}
This code works perfectly, but the files are zip files and I want to extract them before saving.
When I download a file without Puppeteer, the extraction works as in the next code block.
(In this case I'm not yet able to use https requests due to lack of knowledge.)
The code below unzips the file directly on download, without Puppeteer (simple http request):
// assumes fs, http and zlib have been required, and that store, fileName,
// url, fileOnDb and Log are defined elsewhere
const file = fs.createWriteStream(`./src/tempDataFiles/${store}/${fileName}.xml`);
const request = http.get(url, function (response) {
  response.pipe(zlib.createGunzip()).pipe(file);
  file.on("error", async (e) => {
    Log.error(`downloadAndUnZipFile error : ${e}`);
    await fileOnDb.destroy();
    file.end();
  });
  file.on("finish", () => {
    Log.success(`${fileName} download Completed`);
    file.close();
  });
});

Smooch - create attachments from buffer

I'm trying to create an image via the smooch-core API.
I have an image as a Buffer (base64), and I tried something like this:
smoochClient.attachments
  .create({
    appId: appId,
    props: {
      for: 'message',
      access: 'public',
      appUserId: appUserId
    },
    source: myBuffer
  })
  .then(() => {
    console.log('OK');
  }).catch(err => {
    console.log(JSON.stringify(err));
  });
I get this error: "status":413,"statusText":"Payload Too Large"
[When I create this image normally through Postman it works well, so it's not too big - I guess the problem is with how the Buffer is sent.]
Does anyone know how I can send a buffer to this API?
Are you able to submit the base64 data directly in the postman call?
Reading through the spec here it looks like source should be a filepath/name, and not raw binary data.
The easy way may be to save the base64 data to a[n appropriately encoded] file, then provide that file's path as source
Otherwise I'm not sure I'd go so far as to take apart api_instance.upload_attachment() to feed in the base64 data instead of opening/reading from the specified filename.
I found such a solution:
Create a temporary file to get its read stream, and send that in source instead of the myBuffer parameter. Here is the code for creating the temporary file:
async getTempFileSource(bufferData) {
  const fs = require("fs");
  // remove mime type
  if (bufferData.startsWith('data:'))
    bufferData = bufferData.split('base64,')[1];
  // get file extension
  const type = await require('file-type').fromBuffer(Buffer.from(bufferData, 'base64'));
  if (!type) {
    console.log("getTempFileSource - The buffer data is corrupted", 'red');
    return null;
  }
  // create temporary file
  const tempFile = require('tmp').fileSync({postfix: '.' + type.ext});
  // append buffer data to temp file
  fs.appendFileSync(tempFile.name, Buffer.from(bufferData, 'base64'));
  // create read stream from the temp file
  const source = fs.createReadStream(tempFile.name);
  // remove the temp file
  tempFile.removeCallback();
  return source;
}
Here is the code for creating the attachment:
return new Promise(async (resolve, reject) => {
  const source = await getTempFileSource(bufferData);
  if (!source)
    resolve(null);
  else {
    session.smoochClient.attachments
      .create({
        appId: appId,
        props: {
          for: 'message',
          access: 'public',
          appUserId: appUserId
        },
        source: source
      })
      .then(res => {
        resolve(res);
      }).catch(err => {
        reject(err);
      });
  }
});

Listing folders and files within them from particular folder in Google drive

I am trying to get the list of files and folders under a particular folder in Google Drive using Node.js. However, I am only getting the list of folders, not the files within them. What am I doing wrong?
Here is my code.
app.get('/getFF', (req, res) => {
  var folderid = <'my folder id'>;
  var query = "'" + folderid + "' in parents";
  drive.files.list({q: query, fields: 'files(id)'}, (err, resp) => {
    if (err) throw err;
    const files = resp.data.files;
    if (files.length) {
      files.map((file) => {
        console.log(file);
      });
      res.send(files);
    } else {
      res.send('No files found'); // note: res (the Express response), not resp (the Drive response)
    }
  });
})
I believe your goal and your current situation are as follows.
You want to retrieve the file list under the specific folder.
The specific folder has the subfolders.
You want to achieve this using googleapis for Node.js.
You have already been able to retrieve the file list using Drive API v3 with googleapis.
Modification points:
Under the current specification, when drive.files.list({q: query, fields: 'files(id)'}) with var query = "'" + folderid + "' in parents" is used, only the files and folders directly under the folder folderid are retrieved. In order to retrieve all files under a specific folder which has subfolders, it is necessary to retrieve the file list from each subfolder in turn.
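For reference, if you stay with the raw API, a minimal sketch of that recursion could look as follows (assuming an authenticated drive client as in your script; pagination via nextPageToken is omitted for brevity). The library proposed below handles all of this for you.
// Recursively collect all files under a folder by descending into subfolders.
async function listAllFiles(drive, folderId) {
  const res = await drive.files.list({
    q: `'${folderId}' in parents and trashed=false`,
    fields: 'files(id, name, mimeType)',
  });
  let result = [];
  for (const f of res.data.files) {
    if (f.mimeType === 'application/vnd.google-apps.folder') {
      // It is a subfolder: recurse into it.
      result = result.concat(await listAllFiles(drive, f.id));
    } else {
      result.push(f);
    }
  }
  return result;
}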
In this answer, I would like to propose using a library for Node.js, node-getfilelist. This library can retrieve the list of all files under a specific folder, including its subfolders. I created this library for exactly this situation.
Usage:
1. Install library.
At first, please install the library as follows.
$ npm install --save-dev google-drive-getfilelist
or
$ npm install --global google-drive-getfilelist
2. Sample script.
const getfilelist = require("google-drive-getfilelist");

const topFolderId = "###"; // Please set the top folder ID.
getfilelist.GetFileList(
  {
    auth: auth,
    fields: "files(id)",
    id: topFolderId,
  },
  (err, res) => {
    if (err) {
      console.log(err);
      return;
    }
    const fileList = res.fileList.flatMap(({ files }) => files);
    console.log(fileList);
  }
);
In this case, auth is the auth of const drive = google.drive({version: 'v3', auth}).
Result:
When the above script is run, the following result is obtained at console.log(fileList). fileList is the list of all files under the specific folder, including those in its subfolders.
[
  { id: '###' },
  { id: '###' },
  ,
  ,
  ,
]
Reference:
node-getfilelist

How to use bucket.upload() instead of file.createWriteStream() in Google Cloud Storage?

I'm trying to get the permanent (unsigned) download URL after uploading a file to Google Cloud Storage. I can get the signed download URL using file.createWriteStream() but file.createWriteStream() doesn't return the UploadResponse that includes the unsigned download URL. bucket.upload() includes the UploadResponse, and Get Download URL from file uploaded with Cloud Functions for Firebase has several answers explaining how to get the unsigned download URL from the UploadResponse. How do I change file.createWriteStream() in my code to bucket.upload()? Here's my code:
const {Storage} = require('@google-cloud/storage');
const storage = new Storage({ projectId: 'my-app' });
const bucket = storage.bucket('my-app.appspot.com');
var file = bucket.file('Audio/' + longLanguage + '/' + pronunciation + '/' + wordFileType);
const config = {
  action: 'read',
  expires: '03-17-2025',
  content_type: 'audio/mp3'
};
function oedPromise() {
  return new Promise(function(resolve, reject) {
    http.get(oedAudioURL, function(response) {
      response.pipe(file.createWriteStream(options))
        .on('error', function(error) {
          console.error(error);
          reject(error);
        })
        .on('finish', function() {
          file.getSignedUrl(config, function(err, url) {
            if (err) {
              console.error(err);
              return;
            } else {
              resolve(url);
            }
          });
        });
    });
  });
}
I tried this, it didn't work:
function oedPromise() {
  return new Promise(function(resolve, reject) {
    http.get(oedAudioURL, function(response) {
      bucket.upload(response, options)
        .then(function(uploadResponse) {
          console.log('Then do something with UploadResponse.');
        })
        .catch(error => console.error(error));
    });
  });
}
The error message was Path must be a string. In other words, response is a stream, but bucket.upload() expects a string file path.
I used the Google Cloud Text-to-Speech API to simulate what you are doing: getting the text to create the audio file from a text file. Once the file was created, I used the upload method to add it to my bucket and the makePublic method to get its public URL. I also used the async/await feature offered by Node.js instead of function chaining (using then) to avoid the 'No such object: ...' error, which is produced when the makePublic method executes before the file finishes uploading to the bucket.
// Imports the Google Cloud client library
const {Storage} = require('@google-cloud/storage');
// Creates a client using Application Default Credentials
const storage = new Storage();
// Imports the Google Cloud client library
const textToSpeech = require('@google-cloud/text-to-speech');
// Get the bucket
const myBucket = storage.bucket('my_bucket');
// Import other required libraries
const fs = require('fs');
const util = require('util');
// Create a client
const client = new textToSpeech.TextToSpeechClient();
// Create the variable to save the text to create the audio file
var text = "";

// Function that reads the my_text.txt file (which contains the text that will be
// used to create my_audio.mp3) and saves its content in a variable.
function readFile() {
  // This line opens the file as a readable stream
  var readStream = fs.createReadStream('/home/usr/my_text.txt');
  // Read and display the file data on console
  readStream.on('data', function (data) {
    text = data.toString();
  });
  // Execute the createAndUploadFile() function once the whole file is read
  readStream.on('end', function (data) {
    createAndUploadFile();
  });
}

// Function that uploads the file to the bucket and generates its public URL.
async function createAndUploadFile() {
  // Construct the request
  const request = {
    input: {text: text},
    // Select the language and SSML voice gender (optional)
    voice: {languageCode: 'en-US', ssmlGender: 'NEUTRAL'},
    // Select the type of audio encoding
    audioConfig: {audioEncoding: 'MP3'},
  };
  // Performs the text-to-speech request
  const [response] = await client.synthesizeSpeech(request);
  // Write the binary audio content to a local file
  const writeFile = util.promisify(fs.writeFile);
  await writeFile('my_audio.mp3', response.audioContent, 'binary');
  console.log('Audio content written to file: my_audio.mp3');
  // Wait for the myBucket.upload() call to complete before moving on
  let res = await myBucket.upload('/home/usr/my_audio.mp3');
  // If there is an error, it is printed
  if (res.err) {
    console.log('error');
  }
  // If not, the makePublic() function is executed
  else {
    // Get the file in the bucket
    let file = myBucket.file('my_audio.mp3');
    file.makePublic();
  }
}

readFile();
bucket.upload() is a convenience wrapper around file.createWriteStream() that takes a local filesystem path and uploads the file into the bucket as an object:
bucket.upload("path/to/local/file.ext", options)
  .then(() => {
    // upload has completed
  });
To generate a signed URL, you'll need to get a file object from the bucket:
const theFile = bucket.file('file_name');
The file name will either be that of your local file or, if you specified an alternate remote name via options.destination, that name on GCS.
Then, use File.getSignedUrl() to get a signed URL:
bucket.upload("path/to/local/file.ext", options)
  .then(() => {
    const theFile = bucket.file('file.ext');
    return theFile.getSignedUrl(signedUrlOptions); // getSignedUrl returns a Promise
  })
  .then((signedUrl) => {
    // do something with the signedUrl
  });
See:
Bucket.upload() documentation
File.getSignedUrl() documentation
You can make a specific file in a bucket publicly readable with the method makePublic.
From the docs:
const {Storage} = require('@google-cloud/storage');
const storage = new Storage();

// 'my-bucket' is your bucket's name
const myBucket = storage.bucket('my-bucket');

// 'my-file' is the path to your file inside your bucket
const file = myBucket.file('my-file');

file.makePublic(function(err, apiResponse) {});

//-
// If the callback is omitted, we'll return a Promise.
//-
file.makePublic().then(function(data) {
  const apiResponse = data[0];
});
Now the URI http://storage.googleapis.com/[BUCKET_NAME]/[OBJECT_NAME] is a public link to the file, as explained here.
The point is that you only need this minimal code to make an object public, for instance from a Cloud Function. You then already know what the public link looks like and can use it directly in your app.
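For instance, a minimal sketch of building that link from the objects in the snippet above (no extra API call needed):
// Construct the public URL from the bucket and object names.
const publicUrl = `https://storage.googleapis.com/${myBucket.name}/${file.name}`;
console.log(publicUrl); // e.g. https://storage.googleapis.com/my-bucket/my-file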

How to automate Google Drive Docs OCR facility?

I have been using Google Drive and its 'Open with Google Docs' facility to convert images into OCR'd word files (.docx), because the word file also preserves the formatting. I have many images; I upload them to Drive and convert them to editable documents one by one, because PDF conversion does not work.
Each time, I have to wait patiently for one conversion process to finish before starting the next, which is time-consuming.
I used the Google OCR API, but it does not preserve formatting such as bold, alignment, etc.
So, is there any way to automate this process using REST API?
UPDATE
(Screenshots: the uploaded images in Google Drive; the right-click context menu of an image; "Google Docs" in the "Open with" context menu; the OCR result after conversion (language auto-detected); and finally the Google document alongside the image.)
I tried the googleapis samples on GitHub and selected the Drive sample list.js.
My code:
'use strict';

const {google} = require('googleapis');
const sampleClient = require('../sampleclient');

const drive = google.drive({
  version: 'v3',
  auth: sampleClient.oAuth2Client,
});

async function runSample(query) {
  const params = {pageSize: 3};
  params.q = query;
  const res = await drive.files.list(params);
  console.log(res.data);
  return res.data;
}

if (module === require.main) {
  const scopes = ['https://www.googleapis.com/auth/drive.metadata.readonly'];
  sampleClient
    .authenticate(scopes)
    .then(runSample)
    .catch(console.error);
}

module.exports = {
  runSample,
  client: sampleClient.oAuth2Client,
};
How about this modification?
From your sample script, it can be seen that you are using googleapis, so this modification also uses googleapis. The image files in Drive are converted to Google Documents with OCR by the files.copy method of the Drive API. The following modification supposes these points:
You are using googleapis in Node.js.
When you run your script, you have already retrieved a file list using the Drive API.
This means that drive in your script can also be used for the files.copy method.
Notes:
If you have not used the Drive API yet, please check the quickstart (version 3).
Confirmation point:
Before you run the script, please confirm the following points.
In order to use the files.copy method, please include https://www.googleapis.com/auth/drive in the scopes array in the if statement in list.js.
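In the quickstart-style list.js above, that would look something like this:
// The read-only metadata scope is not enough for files.copy; use the full Drive scope.
const scopes = ['https://www.googleapis.com/auth/drive'];
(If you change the scopes, remember to delete the previously stored token so it is regenerated with the new scope.)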
Modified script 1 (convert images to Google Documents with OCR by giving file IDs):
In this modification, runSample() was modified.
function runSample()
{
  // Please set the file ID(s) of sample images in Google Drive.
  const files = [
    "### fileId1 ###",
    "### fileId2 ###",
    "### fileId3 ###", , ,
  ];
  // Take each file and convert it to Google Docs format.
  files.forEach((id) =>
  {
    const params = {
      fileId: id,
      resource:
      {
        mimeType: 'application/vnd.google-apps.document',
        parents: ['### folderId ###'], // If you want to put the converted files in a specific folder, please use this.
      },
      fields: 'id',
    };
    // The copy triggers the conversion (with OCR) to a Google Document.
    drive.files.copy(params, (err, res) =>
    {
      if (err)
      {
        console.error(err);
        return;
      }
      console.log(res.data.id);
    });
  });
}
Note:
Your files (images) are converted to Google Documents by the above script, and it seems that the result (a Google Document) is the same as the sample in your question. But I'm not sure whether this is the quality you want, so please forgive me if it is not.
References:
Node.js Quickstart
Files: copy
Mimetypes
Modified script 2 (convert images to Google Documents with OCR by retrieving them from a single folder, selecting only images):
You want to convert the files(images) to Google Document by retrieving them from a specific folder.
You want to retrieve files of image/png, image/jpeg and image/tiff.
Sample code syntax:
const folderId = "### folderId ###"; // Please set the folder ID including the images.
drive.files.list(
{
pageSize: 1000,
q: `'${folderId}' in parents and (mimeType='image/png' or mimeType='image/jpeg' or mimeType='image/tiff')`,
fields: 'files(id)',
}, (err, res) =>
{
if (err)
{
console.error(err);
return;
}
const files = res.data.files;
files.forEach((file) =>
{
console.log(file.id);
// Please put above script of the files.forEach method by modifying ``id`` to ``file.id``.
});
});
In this next modification, the entire runSample() function was modified.
function runSample()
{
  // Put the folder ID including the files you want to convert.
  const folderId = "### folderId ###";
  // Retrieve the file list.
  drive.files.list(
    {
      pageSize: 1000,
      q: `'${folderId}' in parents and (mimeType='image/png' or mimeType='image/jpeg' or mimeType='image/tiff')`,
      fields: 'files(id)',
    }, (err, res) =>
    {
      if (err)
      {
        console.error(err);
        return;
      }
      const files = res.data.files;
      // Process each file from the retrieved file list.
      files.forEach((file) =>
      {
        const params = {
          fileId: file.id,
          resource:
          {
            mimeType: 'application/vnd.google-apps.document',
            parents: ['### folderId ###'],
          },
          fields: 'id',
        };
        // Convert a file
        drive.files.copy(params, (err, res) =>
        {
          if (err)
          {
            console.error(err);
            return;
          }
          console.log(res.data.id);
        });
      });
    });
}
References:
Files.list
Image MIME types
