Google Vision | Vietnamese: Low Quality OCR Results - node.js

Background
Using the Google Vision API (with Node) to recognize Vietnamese text, the results are of low quality. Some (not all, but some) tone markers and vowel diacritics are missing.
Compare this to their online demo, which returns a decent result (scroll down for the live demo):
https://cloud.google.com/vision/
(As I do not have a company account with them, I cannot ask Google directly.)
Question
Can I tweak my request to get better results?
I already set the language hint to "vi" and tried to combine it with "en". I also tried the more specific "vi-VN".
Example Image
https://www.tecc.org/Slatwall/custom/assets/images/product/default/cache/j056vt-_800w_800h_sb.jpg
Example Code
const fs = require("fs");
const path = require("path");
const vision = require("@google-cloud/vision");

async function quickstart() {
  let text;
  const fileName = "j056vt-_800w_800h_sb.jpg";
  const imageFile = fs.readFileSync(fileName);
  const image = Buffer.from(imageFile).toString("base64");
  const client = new vision.ImageAnnotatorClient();
  const request = {
    image: {
      content: image
    },
    imageContext: {
      languageHints: ["vi", "en"]
    }
  };
  const [result] = await client.textDetection(request);
  for (const tmp of result.textAnnotations) {
    text += tmp.description + "\n";
  }
  const out = path.basename(fileName, path.extname(fileName)) + ".txt";
  fs.writeFileSync(out, text);
}

quickstart();
Solution
// $env:GOOGLE_APPLICATION_CREDENTIALS="[PATH]"
const fs = require("fs");
const path = require("path");
const vision = require("@google-cloud/vision");

async function quickstart() {
  let text = "";
  const fileName = "j056vt-_800w_800h_sb.jpg";
  const imageFile = fs.readFileSync(fileName);
  const image = Buffer.from(imageFile).toString("base64");
  const client = new vision.ImageAnnotatorClient();
  const request = {
    image: {
      content: image
    },
    imageContext: {
      languageHints: ["vi-VN"]
    }
  };
  const [result] = await client.documentTextDetection(request);

  // OUTPUT METHOD A
  for (const tmp of result.textAnnotations) {
    text += tmp.description + "\n";
  }
  console.log(text);
  const out = path.basename(fileName, path.extname(fileName)) + ".txt";
  fs.writeFileSync(out, text);

  // OUTPUT METHOD B
  const fullTextAnnotation = result.fullTextAnnotation;
  console.log(`Full text: ${fullTextAnnotation.text}`);
  fullTextAnnotation.pages.forEach(page => {
    page.blocks.forEach(block => {
      console.log(`Block confidence: ${block.confidence}`);
      block.paragraphs.forEach(paragraph => {
        console.log(`Paragraph confidence: ${paragraph.confidence}`);
        paragraph.words.forEach(word => {
          const wordText = word.symbols.map(s => s.text).join("");
          console.log(`Word text: ${wordText}`);
          console.log(`Word confidence: ${word.confidence}`);
          word.symbols.forEach(symbol => {
            console.log(`Symbol text: ${symbol.text}`);
            console.log(`Symbol confidence: ${symbol.confidence}`);
          });
        });
      });
    });
  });
}

quickstart();

This question is already answered in this one.
In summary, the demo is in this case probably using DOCUMENT_TEXT_DETECTION, which can sometimes extract strings more thoroughly, while you are using TEXT_DETECTION.
You can try making a client.documentTextDetection request instead of client.textDetection, and you will probably get results closer to the demo.
If you want to read the related documentation, you can find it here.
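For reference, mirroring the solution code above, the change amounts to swapping a single call:

// Before: plain text detection
// const [result] = await client.textDetection(request);

// After: document text detection, which often handles dense text and
// diacritics (such as Vietnamese tone markers) better
const [result] = await client.documentTextDetection(request);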
I hope this resolves your question!

Related

Puppeteer to convert html to pdf using Nodejs in Durable functions(fan out fan in)

I'm working on a small project to convert a large XML into several formatted PDF documents. The large XML contains multiple smaller XMLs of similar format, so I'm using a single HTML template for printing all the documents. After producing all the PDF documents, I also need to produce a metadata file with some basic info on each document that was printed.
I thought the fan-out/fan-in scenario of Durable Functions was a perfect fit for my use case. I'm working with Node.js. I set up all my code and it seems to be working fine locally. The orchestration function looks like this:
const df = require("durable-functions");

module.exports = df.orchestrator(function* (context) {
  const xmldata = yield context.df.callActivity("DurablegetblobJS1", "");
  const tasks = [];
  for (const file of xmldata) {
    tasks.push(context.df.callActivity("Durableactivityjs2", file));
  }
  const outputs = yield context.df.Task.all(tasks);
  let finalout = "";
  for (const out of outputs) {
    console.log('I am done1 :' + out);
    finalout = finalout + out;
  }
  return finalout;
});
DurablegetblobJS1: fetches the entire XML and splits it into multiple smaller XMLs (one per document).
Durableactivityjs2: fetches the HTML template, extracts the different values from the individual XMLs, applies them to the HTML, and finally writes the PDF to Azure storage. It returns the name of the PDF document that was printed, for creation of the metadata file. The code for this is below.
var fs = require('fs');
var xml2js = require('xml2js');
var html_to_pdf = require('html-pdf-node');

var parser = new xml2js.Parser();

module.exports = async function (context) {
  //console.log("Hello from activity :")
  var xmldict = {};
  var xmltext = context.bindings.name;

  // Extract the nodes and attributes
  metadata(xmltext, xmldict);
  report(xmltext, xmldict);
  context.log(xmldict);

  const { BlobServiceClient } = require("@azure/storage-blob");

  // Load the .env file if it exists
  require("dotenv").config();
  const AZURE_STORAGE_CONNECTION_STRING = process.env.STORAGE_CONNECTION_STRING || "";
  const blobServiceClient = BlobServiceClient.fromConnectionString(
    AZURE_STORAGE_CONNECTION_STRING
  );
  var containerClient = blobServiceClient.getContainerClient('test');
  var blobname = 'comb_template.html';
  var blockBlobClient = containerClient.getBlockBlobClient(blobname);
  var downloadBlockBlobResponse = await blockBlobClient.download(0);
  var html_template = await streamToText(downloadBlockBlobResponse.readableStreamBody);

  let options = { format: 'A4' };
  let file = { content: html_template };

  const x = await writepdf1(file, options, blobServiceClient, xmldict);
  console.log("Written Blob PDF");
  return x;
};

async function writepdf1(file, options, blobServiceClient, xmldict) {
  const pdfBuffer = await html_to_pdf.generatePdf(file, options);
  const containerClient = blobServiceClient.getContainerClient('test2');
  const targetblob = xmldict['OU'] + '/' + xmldict['ReportName'] + '/' +
    xmldict['OU'] + '_' + xmldict['ReportName'] + '_' + xmldict['DocumentID'] + '_' + '.pdf';
  console.log('Blob name :' + targetblob);
  const blockBlobClient_t = containerClient.getBlockBlobClient(targetblob);
  const uploadBlobResponse = await blockBlobClient_t.upload(pdfBuffer, pdfBuffer.length);
  return targetblob;
}

async function streamToText(readable) {
  readable.setEncoding('utf8');
  let data = '';
  for await (const chunk of readable) {
    data += chunk;
  }
  return data;
}

function metadata(xmltext, xmldict) {
  parser.parseString(xmltext, function (err, result) {
    var test1 = result['HPDPSMsg']['DocumentRequest'][0]['MetaData'][0];
    Object.entries(test1).forEach(([key, value]) => {
      xmldict[key] = value[0];
    });
  });
}

function report(xmltext, xmldict) {
  parser.parseString(xmltext, function (err, result) {
    var test2 = result['HPDPSMsg']['DocumentRequest'][0]['Report'][0]['$'];
    Object.entries(test2).forEach(([key, value]) => {
      xmldict[key] = value;
    });
  });
}
function metadata(xmltext,xmldict){
parser.parseString(xmltext, function (err, result) {
var test1 = result['HPDPSMsg']['DocumentRequest'][0]['MetaData'][0];
Object.entries(test1).forEach(([key, value]) => {
xmldict[key] = value[0];
});
});
}
function report(xmltext,xmldict){
parser.parseString(xmltext, function (err, result) {
var test2 = result['HPDPSMsg']['DocumentRequest'][0]['Report'][0]['$'];
Object.entries(test2).forEach(([key, value]) => {
xmldict[key] = value;
});
});
}
However, when I deploy the entire project to an Azure premium function (EP1 - Windows), I see some errors in App Insights when I try to execute my function, and the PDFs are never generated.
Activity function 'Durableactivityjs2' failed: Could not find browser
revision 818858. Run "PUPPETEER_PRODUCT=firefox npm install" or
"PUPPETEER_PRODUCT=firefox yarn install" to download a supported
Firefox browser binary
I'm a bit clueless how I'm supposed to resolve this. Any help or suggestions would be appreciated.
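One direction worth trying (an untested sketch, not a confirmed fix for this exact error): drive puppeteer directly and pin the browser binary explicitly, so the activity does not depend on html-pdf-node locating a downloaded Chromium revision. PUPPETEER_EXECUTABLE_PATH is a standard puppeteer environment variable; whether a usable Chromium actually exists at that path in the deployed EP1 environment is the assumption to verify.

const puppeteer = require("puppeteer");

// Untested sketch: generate the PDF with puppeteer itself instead of
// html-pdf-node, so the Chromium binary can be pinned explicitly.
async function htmlToPdfBuffer(html, pdfOptions) {
  const browser = await puppeteer.launch({
    // Falls back to puppeteer's bundled Chromium when the variable is unset.
    executablePath: process.env.PUPPETEER_EXECUTABLE_PATH || undefined,
    args: ["--no-sandbox", "--disable-setuid-sandbox"],
  });
  try {
    const page = await browser.newPage();
    await page.setContent(html, { waitUntil: "networkidle0" });
    return await page.pdf(pdfOptions); // e.g. { format: "A4" }
  } finally {
    await browser.close();
  }
}

The writepdf1 helper above could then call htmlToPdfBuffer(html_template, { format: 'A4' }) in place of html_to_pdf.generatePdf(file, options).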

Google Vision Automl fs.readFileSync alternative

I'm playing around with Google Vision AutoML in React, and I'm trying to figure out a way to change fs.readFileSync to read from an online URL instead. I've tried https, request and even url in Node.js, but I still can't figure out a way to make it read from a URL source. Any help is appreciated, thank you.
const projectId = "abc"
const location = "abc"
const modelId = "abc"
const filePath = "./test-pics/7.jpg"
const { PredictionServiceClient } = require("#google-cloud/automl").v1
const https = require("https")
const request = require("request")
const url = require("node:url")
const fs = require("fs")
const client = new PredictionServiceClient({
keyFilename: "./vision-private-key.json",
})
const imageURL = "https://cdn.filestackcontent.com/IJ0kViHQQyiwQTVaFH66"
// Read the file content for translation.
const content = fs.readFileSync(filePath)
const content1 = https.get(imageURL)
const content2 = request.get(
"https://cdn.filestackcontent.com/IJ0kViHQQyiwQTVaFH66"
)
async function predict() {
const request = {
name: client.modelPath(projectId, location, modelId),
payload: {
image: {
imageBytes: content,
},
},
}
const [response] = await client.predict(request)
for (const annotationPayload of response.payload) {
console.log(`type of vehicle: ${annotationPayload.displayName}`)
const carType = annotationPayload.displayName
console.log(carType)
}
}
predict()
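One possible direction (an untested sketch reusing client, imageURL and the model constants from the snippet above; fetchImage is a hypothetical helper): download the image into a Buffer first, then pass the bytes to the prediction request exactly as fs.readFileSync would have. Note that plain https.get does not follow redirects, so a CDN that redirects would need extra handling.

const https = require("https");

// Hypothetical helper: resolve a URL to a Buffer of its raw bytes.
function fetchImage(imageUrl) {
  return new Promise((resolve, reject) => {
    https.get(imageUrl, (res) => {
      const chunks = [];
      res.on("data", (chunk) => chunks.push(chunk));
      res.on("end", () => resolve(Buffer.concat(chunks)));
      res.on("error", reject);
    }).on("error", reject);
  });
}

async function predictFromUrl() {
  const content = await fetchImage(imageURL); // a Buffer, like fs.readFileSync
  const request = {
    name: client.modelPath(projectId, location, modelId),
    payload: {
      image: {
        imageBytes: content,
      },
    },
  };
  const [response] = await client.predict(request);
  for (const annotationPayload of response.payload) {
    console.log(`type of vehicle: ${annotationPayload.displayName}`);
  }
}

predictFromUrl();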

Opensea listing through Opensea-js is not working

I have created an NFT and it is listed on OpenSea. Now I am trying to create a sell order for my item through the opensea-js SDK. Unfortunately it is not working, and I do not know where I am making a mistake. I am also not sure about the base derivation path. Below is my code to create the sell order. Please help me resolve this.
const opensea = require("opensea-js");
const OpenSeaPort = opensea.OpenSeaPort;
const Network = opensea.Network;
const MnemonicWalletSubprovider = require("@0x/subproviders").MnemonicWalletSubprovider;
const RPCSubprovider = require("web3-provider-engine/subproviders/rpc");
const Web3ProviderEngine = require("web3-provider-engine");

const MNEMONIC = "accuse never ....";
const NFT_CONTRACT_ADDRESS = "0x6C317E7dE3e8823BBc308a2912Ba6F24587fc167";
const OWNER_ADDRESS = "0x589a1532AAaE84e38345b58C11CF4697Ea89A866";
const API_KEY = "";

const infuraRpcSubprovider = new RPCSubprovider({
  rpcUrl: "https://rinkeby.infura.io/v3/c0e4482bdf9e4f539692666cd56ef6e4"
});

const BASE_DERIVATION_PATH = `44'/60'/0'/0`;
const mnemonicWalletSubprovider = new MnemonicWalletSubprovider({
  mnemonic: MNEMONIC,
  baseDerivationPath: BASE_DERIVATION_PATH,
  chainId: 4
});

const providerEngine = new Web3ProviderEngine();
providerEngine.addProvider(mnemonicWalletSubprovider);
providerEngine.addProvider(infuraRpcSubprovider);
providerEngine.start();

const seaport = new OpenSeaPort(
  providerEngine,
  {
    networkName: Network.Rinkeby,
    apiKey: API_KEY,
  },
  (arg) => console.log(arg)
);

async function main() {
  console.log("Auctioning an item for a fixed price...");
  const fixedPriceSellOrder = await seaport.createSellOrder({
    asset: {
      tokenId: "3",
      tokenAddress: NFT_CONTRACT_ADDRESS,
    },
    startAmount: 0.0001,
    expirationTime: 0,
    accountAddress: OWNER_ADDRESS,
  });
  console.log(fixedPriceSellOrder);
}

main();
This has been resolved. I changed the HD provider to @truffle/hdwallet-provider. Now I can see the listing on OpenSea after createSellOrder through opensea-js.
This link helped me get this resolved.
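For completeness, a minimal sketch of what that change might look like (assuming @truffle/hdwallet-provider's classic two-argument constructor, and reusing MNEMONIC, OpenSeaPort, Network, API_KEY and the Infura URL from the question):

const HDWalletProvider = require("@truffle/hdwallet-provider");

// Replaces the Web3ProviderEngine + subprovider wiring above.
const provider = new HDWalletProvider(
  MNEMONIC,
  "https://rinkeby.infura.io/v3/c0e4482bdf9e4f539692666cd56ef6e4"
);

const seaport = new OpenSeaPort(provider, {
  networkName: Network.Rinkeby,
  apiKey: API_KEY,
});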

Save CSV from the web to Apify Dataset

I am trying to get some CSV data from a Google Sheet and store it in an Apify dataset.
const Apify = require('apify');
const request = require('request-promise');

Apify.main(async () => {
  const URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv";
  const html = await request(URL);
  console.log('My output:');
  console.log(html);
  await Apify.setValue('OUTPUT', html);
  const namedDataset = await Apify.openDataset();
  await namedDataset.pushData(html);
});
Here is the error message:
2020-01-01T16:43:21.501Z My output:
2020-01-01T16:43:21.510Z "city","country"
2020-01-01T16:43:21.512Z "Berlin ","Germany"
2020-01-01T16:43:21.513Z "Los Angeles","United States"
2020-01-01T16:43:21.514Z "Melbourne","Australia"
2020-01-01T16:43:21.516Z "Sydney","Australia"
2020-01-01T16:43:21.517Z "London","United Kingdom"
2020-01-01T16:43:21.519Z "New York City","United States"
2020-01-01T16:43:21.614Z ERROR: The function passed to Apify.main() threw an exception: (error details: type=invalid-parameter)
2020-01-01T16:43:21.616Z ApifyClientError: Parameter "data" of type Array | Object must be provided
2020-01-01T16:43:21.617Z at exports.checkParamOrThrow (/usr/src/app/node_modules/apify-client/build/utils.js:222:15)
2020-01-01T16:43:21.619Z at Dataset.pushData (/usr/src/app/node_modules/apify/build/dataset.js:222:34)
2020-01-01T16:43:21.620Z at Apify.main (/usr/src/app/main.js:16:22)
2020-01-01T16:43:21.621Z at process._tickCallback (internal/process/next_tick.js:68:7)
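The immediate cause is right in the message: pushData accepts only an object or an array of objects, and html here is a raw CSV string. A minimal illustration:

await namedDataset.pushData(html);             // throws: a string is not Object | Array
await namedDataset.pushData({ rawCsv: html }); // accepted: the string is wrapped in an object

The answers below go further and turn the CSV into proper row objects.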
A more elegant solution would be using our Google Sheets actor.
const Apify = require('apify');

Apify.main(async () => {
  const spreadsheetId = '1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk';
  const sheetsActorInput = {
    mode: 'read',
    spreadsheetId,
  };
  const data = await Apify.call('lukaskrivka/google-sheets', sheetsActorInput);
  const namedDataset = await Apify.openDataset('my-dataset');
  await namedDataset.pushData(data);
});
The only disadvantage (also an advantage in some sense) is that you need to authorize on your first run, but that is really simple.
I was able to use this somewhat hacky approach. I am sure there is a more modern, elegant approach:
const Apify = require('apify');
const request = require('request-promise');

function csvJSON(csv) { // https://stackoverflow.com/a/27979069/2330272
  var lines = csv.split("\n");
  var result = [];
  // NOTE: If your columns contain commas in their values, you'll need
  // to deal with those before doing the next step
  // (you might convert them to &&& or something, then convert them back later)
  // jsfiddle showing the issue: https://jsfiddle.net/
  var headers = lines[0].split(",");
  for (var i = 1; i < lines.length; i++) {
    var obj = {};
    var currentline = lines[i].split(",");
    for (var j = 0; j < headers.length; j++) {
      obj[headers[j]] = currentline[j];
    }
    result.push(obj);
  }
  return JSON.stringify(result); // JSON
}

Apify.main(async () => {
  var URL = "https://docs.google.com/spreadsheets/d/1-auXklWqHQ-jj6AXymMPa7FLtP1eYGJGF3rprxuWitk/gviz/tq?tqx=out:csv"; // test
  const html = await request(URL);
  const urls = csvJSON(html.replace(/\"/g, "")); // remove quotes from csv data
  console.log('My output:');
  const namedDataset = await Apify.openDataset();
  await namedDataset.pushData(JSON.parse(urls));
});

All my scraped text ends up in one big object instead of separate objects with Cheerio

I'm following a web scraping course that uses Cheerio. I'm practicing on a different website than the one they use in the course, and now I run into the problem that all my scraped text ends up in one big object, while every title should end up in its own object. Can someone see what I did wrong? I've already bumped my head on this problem for two hours.
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
  try {
    const htmlResult = await request.get(url);
    const $ = await cheerio.load(htmlResult);
    $("div.aanbod").each((index, element) => {
      const result = $(element).children(".item");
      const title = result.find("h2").text().trim();
      const characteristics = result.find("h4").text();
      const scrapeResult = { title, characteristics };
      scrapeResults.push(scrapeResult);
    });
    console.log(scrapeResults);
  } catch (err) {
    console.error(err);
  }
}

scrapeHuurgoed();
This is the link to the repo: https://github.com/danielkroon/huurgoed-scraper/blob/master/index.js
Thanks!
That is because of the way you used selectors: your script iterates over the single div.aanbod container, so .find("h2").text() concatenates every title on the page into one string. Iterating over each div.item instead yields one object per listing. I've modified your script to fetch the content as you expected; currently it collects titles and characteristics, and you can add the rest yourself.
This is how you can get the required output:
const request = require('request-promise');
const cheerio = require('cheerio');

const url = "https://huurgoed.nl/gehele-aanbod";
const scrapeResults = [];

async function scrapeHuurgoed() {
  try {
    const htmlResult = await request.get(url);
    const $ = await cheerio.load(htmlResult);
    $("div.item").each((index, element) => {
      const title = $(element).find(".kenmerken > h2").text().trim();
      const characteristics = $(element).find("h4").text().trim();
      scrapeResults.push({ title, characteristics });
    });
    console.log(scrapeResults);
  } catch (err) {
    console.error(err);
  }
}

scrapeHuurgoed();
