Computer Vision REST API format - azure

I am currently using the trial version of the Computer Vision API in Java. I took the sample code from the website and successfully got a JSON response.
However, the format of the JSON I got was quite different from the one shown on the demo page.
For example, my JSON response:
"regions": [
{
"boundingBox": "21,16,304,451",
"lines": [
{
"boundingBox": "28,16,288,41",
"words": [
{
"boundingBox": "28,16,288,41",
"text": "NOTHING"
}
]
},
Whereas the demo page is:
{
"lines": [
{
"boundingBox": [
122,
122,
401,
85,
404,
229,
143,
233
],
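Looking at the bounding box format, we can clearly see the difference: my response gives each box as a single comma-separated string, while the demo page gives an array of eight numbers.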

The response you got comes from the Computer Vision API's OCR operation, as its documented example shows:
{
"language": "en",
"textAngle": -2.0000000000000338,
"orientation": "Up",
"regions": [
{
"boundingBox": "462,379,497,258",
"lines": [
{
"boundingBox": "462,379,497,74",
"words": [
{
"boundingBox": "462,379,41,73",
"text": "A"
},
{
"boundingBox": "523,379,153,73",
"text": "GOAL"
},
{
"boundingBox": "694,379,265,74",
"text": "WITHOUT"
}
]
},
{
"boundingBox": "565,471,289,74",
"words": [
{
"boundingBox": "565,471,41,73",
"text": "A"
},
{
"boundingBox": "626,471,150,73",
"text": "PLAN"
},
{
"boundingBox": "801,472,53,73",
"text": "IS"
}
]
},
{
"boundingBox": "519,563,375,74",
"words": [
{
"boundingBox": "519,563,149,74",
"text": "JUST"
},
{
"boundingBox": "683,564,41,72",
"text": "A"
},
{
"boundingBox": "741,564,153,73",
"text": "WISH"
}
]
}
]
}
]
}
The response on the demo page, on the other hand, comes from calling the Computer Vision API's Recognize Text operation and then Get Recognize Text Operation Result to fetch the outcome, as its documented example shows:
{
"status": "Succeeded",
"recognitionResult": {
"lines": [
{
"boundingBox": [
202,
618,
2047,
643,
2046,
840,
200,
813
],
"text": "Our greatest glory is not",
"words": [
{
"boundingBox": [
204,
627,
481,
628,
481,
830,
204,
829
],
"text": "Our"
},
{
"boundingBox": [
519,
628,
1057,
630,
1057,
832,
518,
830
],
"text": "greatest"
},
{
"boundingBox": [
1114,
630,
1549,
631,
1548,
833,
1114,
832
],
"text": "glory"
},
{
"boundingBox": [
1586,
631,
1785,
632,
1784,
834,
1586,
833
],
"text": "is"
},
{
"boundingBox": [
1822,
632,
2115,
633,
2115,
835,
1822,
834
],
"text": "not"
}
]
},
{
"boundingBox": [
420,
1273,
2954,
1250,
2958,
1488,
422,
1511
],
"text": "but in rising every time we fall",
"words": [
{
"boundingBox": [
423,
1269,
634,
1268,
635,
1507,
424,
1508
],
"text": "but"
},
{
"boundingBox": [
667,
1268,
808,
1268,
809,
1506,
668,
1507
],
"text": "in"
},
{
"boundingBox": [
874,
1267,
1289,
1265,
1290,
1504,
875,
1506
],
"text": "rising"
},
{
"boundingBox": [
1331,
1265,
1771,
1263,
1772,
1502,
1332,
1504
],
"text": "every"
},
{
"boundingBox": [
1812,
1263,
2178,
1261,
2179,
1500,
1813,
1502
],
"text": "time"
},
{
"boundingBox": [
2219,
1261,
2510,
1260,
2511,
1498,
2220,
1500
],
"text": "we"
},
{
"boundingBox": [
2551,
1260,
3016,
1258,
3017,
1496,
2552,
1498
],
"text": "fall"
}
]
},
{
"boundingBox": [
1612,
903,
2744,
935,
2738,
1139,
1607,
1107
],
"text": "in never failing ,",
"words": [
{
"boundingBox": [
1611,
934,
1707,
933,
1708,
1147,
1613,
1147
],
"text": "in"
},
{
"boundingBox": [
1753,
933,
2132,
930,
2133,
1144,
1754,
1146
],
"text": "never"
},
{
"boundingBox": [
2162,
930,
2673,
927,
2674,
1140,
2164,
1144
],
"text": "failing"
},
{
"boundingBox": [
2703,
926,
2788,
926,
2790,
1139,
2705,
1140
],
"text": ","
}
]
}
]
}
}

Related

I have JSON-formatted output as below; how do I access and validate the output of each node with an Assert statement?

I have JSON-formatted output as below; how do I access and validate the output of each node with an Assert statement?
{
"Type": "Page",
"X": 0,
"Y": 0,
"Width": 696,
"Height": 888,
"Children": [
{
"Type": "Column",
"X": 0,
"Y": 0,
"Width": 696,
"Height": 888,
"Children": [
{
"Type": "Paragraph",
"X": 209,
"Y": 290,
"Width": 248,
"Height": 24,
"Children": [
{
"Type": "Line",
"X": 209,
"Y": 290,
"Width": 248,
"Height": 24,
"Children": [
{
"Type": "Word",
"X": 209,
"Y": 290,
"Width": 49,
"Height": 24,
"Children": [
],
"Content": "Core"
},
{
"Type": "Word",
"X": 263,
"Y": 290,
"Width": 106,
"Height": 24,
"Children": [
],
"Content": "Enterprise"
},
{
"Type": "Word",
"X": 375,
"Y": 290,
"Width": 82,
"Height": 24,
"Children": [
],
"Content": "Installer"
}
],
"Content": null
}
],
"Content": null
},
{
"Type": "Paragraph",
"X": 580,
"Y": 803,
"Width": 79,
"Height": 13,
"Children": [
{
"Type": "Line",
"X": 580,
"Y": 803,
"Width": 79,
"Height": 13,
"Children": [
{
"Type": "Word",
"X": 580,
"Y": 803,
"Width": 46,
"Height": 13,
"Children": [
],
"Content": "Version"
},
{
"Type": "Word",
"X": 629,
"Y": 803,
"Width": 12,
"Height": 13,
"Children": [
],
"Content": "8."
},
{
"Type": "Word",
"X": 640,
"Y": 803,
"Width": 12,
"Height": 13,
"Children": [
],
"Content": "0."
},
{
"Type": "Word",
"X": 651,
"Y": 803,
"Width": 8,
"Height": 13,
"Children": [
],
"Content": "0"
}
],
"Content": null
}
],
"Content": null
}
],
"Content": null
}
],
"Content": null
}
Looking for solutions

Create pandas dataframe based on row search substring value

I am struggling to shape my Python data into a dataframe. Can anyone help me with the code that might get me there? It seems the easiest solution would be to create columns based on substrings of text from the rows but I cannot find documentation to get me the shape I am seeking from the rows.
Original Dataframe - no column headers, data all in rows
Desired Dataframe - bounding box rows to columns with uniform header, confidence to column
My response is structured as follows:
{
"status": "succeeded",
"createdDateTime": "2020-08-28T19:21:29Z",
"lastUpdatedDateTime": "2020-08-28T19:21:31Z",
"analyzeResult": {
"version": "3.0.0",
"readResults": [{
"page": 1,
"angle": 0.1296,
"width": 1700,
"height": 2200,
"unit": "pixel",
"lines": [{
"boundingBox": [
182,
119,
383,
119,
383,
161,
182,
160
],
"text": "FORM 101",
"words": [{
"boundingBox": [
183,
120,
305,
120,
305,
161,
182,
161
],
"text": "FORM",
"confidence": 0.987
},
{
"boundingBox": [
318,
120,
381,
120,
382,
162,
318,
161
],
"text": "101",
"confidence": 0.987
}
]
},
{
"boundingBox": [
578,
129,
1121,
129,
1121,
163,
578,
162
],
"text": "The Commonwealth of Massachusetts",
"words": [{
"boundingBox": [
579,
129,
634,
129,
634,
162,
579,
161
],
"text": "The",
"confidence": 0.988
},
{
"boundingBox": [
641,
129,
868,
129,
866,
164,
640,
162
],
"text": "Commonwealth",
"confidence": 0.979
},
{
"boundingBox": [
874,
129,
902,
129,
900,
164,
872,
164
],
"text": "of",
"confidence": 0.988
},
{
"boundingBox": [
908,
129,
1120,
130,
1117,
163,
906,
164
],
"text": "Massachusetts",
"confidence": 0.977
}
]
},
{
"boundingBox": [
1341,
137,
1540,
138,
1540,
164,
1341,
163
],
"text": "DIA USE ONLY",
"words": [{
"boundingBox": [
1342,
138,
1392,
138,
1392,
164,
1341,
163
],
"text": "DIA",
"confidence": 0.983
},
{
"boundingBox": [
1397,
138,
1452,
139,
1452,
164,
1397,
164
],
"text": "USE",
"confidence": 0.983
},
{
"boundingBox": [
1457,
139,
1539,
138,
1540,
164,
1457,
164
],
"text": "ONLY",
"confidence": 0.986
}
]
},
{
"boundingBox": [
459,
169,
1235,
168,
1235,
202,
459,
203
],
"text": "Department of Industrial Accidents - Department 101",
"words": [{
"boundingBox": [
460,
170,
634,
170,
634,
203,
460,
204
],
"text": "Department",
"confidence": 0.981
},
{
"boundingBox": [
640,
170,
669,
170,
669,
203,
640,
203
],
"text": "of",
"confidence": 0.983
},
{
"boundingBox": [
676,
170,
821,
169,
821,
203,
676,
203
],
"text": "Industrial",
"confidence": 0.981
},
{
"boundingBox": [
828,
169,
967,
169,
966,
203,
828,
203
],
"text": "Accidents",
"confidence": 0.952
},
{
"boundingBox": [
973,
169,
993,
169,
993,
203,
973,
203
],
"text": "-",
"confidence": 0.983
},
{
"boundingBox": [
1000,
169,
1176,
169,
1176,
203,
999,
203
],
"text": "Department",
"confidence": 0.982
},
{
"boundingBox": [
1183,
169,
1236,
169,
1235,
203,
1182,
203
],
"text": "101",
"confidence": 0.987
}
]
},
{
"boundingBox": [
511,
205,
1189,
205,
1189,
233,
511,
234
],
"text": "1 Congress Street, Suite 100, Boston, Massachusetts 02114-2017",
"words": [{
"boundingBox": [
513,
206,
520,
206,
519,
233,
512,
233
],
"text": "1",
"confidence": 0.974
},
{
"boundingBox": [
525,
206,
625,
206,
624,
234,
524,
233
],
"text": "Congress",
"confidence": 0.981
},
{
"boundingBox": [
630,
206,
702,
206,
701,
234,
629,
234
],
"text": "Street,",
"confidence": 0.977
},
{
"boundingBox": [
707,
206,
763,
206,
762,
234,
706,
234
],
"text": "Suite",
"confidence": 0.983
},
{
"boundingBox": [
769,
206,
812,
206,
811,
234,
767,
234
],
"text": "100,",
"confidence": 0.983
},
{
"boundingBox": [
818,
206,
898,
206,
897,
234,
816,
234
],
"text": "Boston,",
"confidence": 0.983
},
{
"boundingBox": [
903,
206,
1059,
205,
1058,
234,
902,
234
],
"text": "Massachusetts",
"confidence": 0.975
},
{
"boundingBox": [
1064,
205,
1189,
205,
1187,
233,
1063,
234
],
"text": "02114-2017",
"confidence": 0.978
}
]
},
{
"boundingBox": [
422,
236,
1279,
237,
1279,
263,
422,
263
],
"text": "Info. Line 800-323-3249 ext. 470 in Mass. Outside Mass. - 617-727-4900 ext. 470",
"words": [{
"boundingBox": [
423,
237,
472,
237,
472,
263,
422,
263
],
"text": "Info.",
"confidence": 0.983
},
{
"boundingBox": [
477,
237,
526,
237,
526,
264,
477,
264
],
"text": "Line",
"confidence": 0.986
},
{
"boundingBox": [
531,
237,
674,
237,
674,
264,
531,
264
],
"text": "800-323-3249",
"confidence": 0.977
},
{
"boundingBox": [
679,
237,
718,
237,
718,
264,
679,
264
],
"text": "ext.",
"confidence": 0.982
},
{
"boundingBox": [
724,
237,
763,
237,
763,
264,
723,
264
],
"text": "470",
"confidence": 0.986
},
{
"boundingBox": [
768,
237,
790,
237,
790,
264,
768,
264
],
"text": "in",
"confidence": 0.987
},
{
"boundingBox": [
795,
237,
865,
237,
865,
264,
795,
264
],
"text": "Mass.",
"confidence": 0.983
},
{
"boundingBox": [
870,
237,
953,
237,
953,
264,
870,
264
],
"text": "Outside",
"confidence": 0.981
},
{
"boundingBox": [
958,
237,
1019,
237,
1020,
264,
958,
264
],
"text": "Mass.",
"confidence": 0.984
},
{
"boundingBox": [
1025,
237,
1036,
237,
1037,
264,
1025,
264
],
"text": "-",
"confidence": 0.983
},
{
"boundingBox": [
1042,
237,
1184,
237,
1185,
264,
1042,
264
],
"text": "617-727-4900",
"confidence": 0.975
},
{
"boundingBox": [
1190,
237,
1229,
238,
1229,
264,
1190,
264
],
"text": "ext.",
"confidence": 0.985
},
{
"boundingBox": [
1234,
238,
1278,
238,
1278,
264,
1234,
264
],
"text": "470",
"confidence": 0.983
}
]
},
{
"boundingBox": [
716,
264,
984,
266,
984,
293,
715,
292
],
"text": "http://www.mass.gov/dia",
"words": [{
"boundingBox": [
717,
265,
985,
267,
984,
294,
716,
293
],
"text": "http://www.mass.gov/dia",
"confidence": 0.952
}]
},
{
"boundingBox": [
398,
299,
1289,
299,
1289,
342,
398,
342
],
"text": "EMPLOYER'S FIRST REPORT OF INJURY",
"words": [{
"boundingBox": [
399,
300,
693,
300,
693,
341,
399,
343
],
"text": "EMPLOYER'S",
"confidence": 0.98
},
{
"boundingBox": [
702,
300,
836,
300,
836,
341,
702,
341
],
"text": "FIRST",
"confidence": 0.982
},
{
"boundingBox": [
845,
300,
1036,
300,
1036,
341,
844,
341
],
"text": "REPORT",
"confidence": 0.985
},
{
"boundingBox": [
1045,
300,
1105,
300,
1104,
342,
1044,
341
],
"text": "OF",
"confidence": 0.988
},
{
"boundingBox": [
1113,
300,
1288,
299,
1287,
343,
1113,
342
],
"text": "INJURY",
"confidence": 0.986
}
]
},
{
"boundingBox": [
691,
354,
1005,
355,
1005,
395,
691,
393
],
"text": "OR FATALITY",
"words": [{
"boundingBox": [
691,
354,
760,
355,
760,
395,
692,
394
],
"text": "OR",
"confidence": 0.988
},
{
"boundingBox": [
768,
355,
1005,
356,
1003,
395,
768,
395
],
"text": "FATALITY",
"confidence": 0.981
}
]
}
]
}]
}
}
You did not supply your data or an explanation, so I built a small sample; with it, the code below mostly does what you want.
The comments explain the approach.
There is more work to be done on linekey; however, I cannot see the relationship between the actual data and the outcome you posted as an image.
import re
import numpy as np
import pandas as pd
# Sample input: a flattened response with one row per JSON leaf.
# Column 0 holds the underscore-joined path, column 1 the value found at that path.
flat_paths = [
    "analyzeResult_readResults_0_lines_0_text",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_0",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_1",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_2",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_3",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_4",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_5",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_6",
    "analyzeResult_readResults_0_lines_0_words_0_boundingBox_7",
    "analyzeResult_readResults_0_lines_0_words_0_text",
    "analyzeResult_readResults_0_lines_0_words_0_confidence",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_0",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_1",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_2",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_3",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_4",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_5",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_6",
    "analyzeResult_readResults_0_lines_0_words_1_boundingBox_7",
    "analyzeResult_readResults_0_lines_0_words_1_text",
    "analyzeResult_readResults_0_lines_0_words_1_confidence",
    "analyzeResult_readResults_0_lines_1_boundingBox_0",
    "analyzeResult_readResults_0_lines_1_boundingBox_1",
    "analyzeResult_readResults_0_lines_1_boundingBox_2",
    "analyzeResult_readResults_0_lines_1_boundingBox_3",
    "analyzeResult_readResults_0_lines_1_boundingBox_4",
    "analyzeResult_readResults_0_lines_1_boundingBox_5",
    "analyzeResult_readResults_0_lines_1_boundingBox_6",
    "analyzeResult_readResults_0_lines_1_boundingBox_7",
]
flat_values = [
    "FORM 101",
    183, 120, 305, 120, 305, 161, 182, 161,
    "FORM", 0.987,
    318, 120, 381, 120, 382, 162, 318, 161,
    101, 0.987,
    578, 129, 1121, 129, 1121, 163, 578, 162,
]
df = pd.DataFrame({0: flat_paths, 1: flat_values}, index=list(range(17, 46)))


def _column_for(path, line):
    """Return the output column name for `path`, given the row key `line` it belongs to."""
    # Whatever is left of the path once the row key is removed, minus the joining "_".
    rest = path.replace(line, "").strip("_")
    if rest == "":
        # The path *is* the row key (e.g. a "..._text" leaf): its value goes under "Value".
        return "Value"
    if rest == "confidence":
        return "Confidence"
    return rest


keyed = df.rename(columns={0: "key", 1: "val"})
paths = keyed["key"]

# Part of each path in front of "_bounding" / "_confidence"; NaN when the path has neither.
box_prefix = paths.str.extract("(.*)_bounding", expand=False)
conf_prefix = paths.str.extract("(.*)_confidence", expand=False)

# Row key: the bounding-box prefix if there is one, else the confidence prefix,
# else the full path unchanged.
line_keys = np.where(
    box_prefix.notna(),
    box_prefix,
    np.where(conf_prefix.notna(), conf_prefix, paths),
)
col_keys = [_column_for(path, line) for path, line in zip(paths, line_keys)]

df = (
    keyed[["val"]]
    .assign(linekey=line_keys, colkey=col_keys)
    # values are a mix of numbers and strings, so take "first" per cell and unstack
    # the column keys to get one row per linekey
    .groupby(["linekey", "colkey"])
    .agg({"val": "first"})
    .unstack()
)
print(df.to_string())
output
val
colkey Confidence Value boundingBox_0 boundingBox_1 boundingBox_2 boundingBox_3 boundingBox_4 boundingBox_5 boundingBox_6 boundingBox_7
linekey
analyzeResult_readResults_0_lines_0_text NaN FORM 101 NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_0_words_0 0.987 NaN 183 120 305 120 305 161 182 161
analyzeResult_readResults_0_lines_0_words_0_text NaN FORM NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_0_words_1 0.987 NaN 318 120 381 120 382 162 318 161
analyzeResult_readResults_0_lines_0_words_1_text NaN 101 NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_1 NaN NaN 578 129 1121 129 1121 163 578 162

"Expected Iterable, but did not find one for field HolidayPackageType.hotelCategory."

# AddHolidayPackages: passes every declared variable straight through to the
# addHolidayPackages field and selects the same set of fields back from the result.
mutation AddHolidayPackages($packageName:String, $destination:String,$country:String,$numberOfNights:Int,
$citiesCovered:String,$highlights:String,$includes:String,$excludes:String,$dayWiseItinerary:[String],
$termsandConditions:String,$cancellationPolicy:String,$remarks:String,$notes:String,$images:[String],$hotelCategory:HotelCategoryInput)
{
addHolidayPackages(packageName: $packageName,destination:$destination,country:$country,numberOfNights:$numberOfNights,
citiesCovered:$citiesCovered,highlights:$highlights,includes:$includes,excludes:$excludes,dayWiseItinerary:$dayWiseItinerary,
termsandConditions:$termsandConditions,cancellationPolicy:$cancellationPolicy,remarks:$remarks,notes:$notes,images:$images,hotelCategory:$hotelCategory)
{
packageName,
destination,
country,
numberOfNights,
citiesCovered,
highlights,
includes,
excludes,
dayWiseItinerary,
termsandConditions,
cancellationPolicy,
remarks,
notes,
images,
# NOTE(review): the variables sent with this mutation give a-e as lists of objects
# (hotelName, adult, child, infant). If the server schema types them as object lists,
# each of them presumably needs its own sub-selection here - confirm against the
# schema's hotel category type, which is not shown.
hotelCategory {
a
b
c
d
e
}
}
}
{
"packageName": "vinay",
"destination": "Thailand",
"country": "Thailand",
"numberOfNights": 5,
"citiesCovered": "tgr,tht",
"highlights": "yhtyhjtrtjtjjtjtjtrjng",
"includes": "ugbtegbtgtbbrihgrhtg",
"excludes": "bbregrgvrdww fekfjnlnfkef",
"dayWiseItinerary": [
"day1 refjrek3fbkjbefkjr",
"day2:hvrfjvr3ef"
],
"termsandConditions": "hbvfvkjhbvgkjbftg",
"cancellationPolicy": "ktgkntrljnbtrngb",
"remarks": "mkntgbntnrbhrkjtgn",
"notes": "btjgkbtjkbgjtbgkj",
"images": ["rfvgr","trgtrgh","trgregh"],
"hotelCategory": {
"a": [
{
"hotelName": "hghtgh,rgrgreg,rgreg",
"adult": 233,
"child": 545,
"infant": 677
}
],
"b": [
{
"hotelName": "hghtgh,rgrgreg,rgreg",
"adult": 233,
"child": 545,
"infant": 677
}
],
"c": [
{
"hotelName": "hghtgh,rgrgreg,rgreg",
"adult": 233,
"child": 545,
"infant": 677
}
],
"d": [
{
"hotelName": "hghtgh,rgrgreg,rgreg",
"adult": 233,
"child": 545,
"infant": 677
}
],
"e": [
{
"hotelName": "hghtgh,rgrgreg,rgreg",
"adult": 233,
"child": 545,
"infant": 677
}
]
}
}
You seem to be using an array type in place of an object type. Check your types (hotelCategoryType) and make sure they match your mutation, and vice versa.

Azure OCR [printed text] is not reading the receipt lines in the right order

Application Goal: read the receipt image, extract the store/organization name along with the total amount paid. Feed it to web-form for auto-filling & submission.
Post Request - https://*.cognitiveservices.azure.com/vision/v2.0/recognizeText?{params}
Get Request - https://*.cognitiveservices.azure.com/vision/v2.0/textOperations/{operationId}
However, when I get the results back, the line ordering is sometimes mixed up (see the picture below; the JSON response shows similar results).
This mixing results in the total being read as $0.88.
Similar situations are present for 2 out of 9 testing receipts.
Q: Why does it work for receipts with similar and with different structures, but not consistently for all of them? Also, any ideas on how to get around it?
I had a quick look at your case.
OCR Result
As you mentioned, the results are not ordered as you expected. I had a quick look at the bounding box values and I don't know how they are ordered. You could try to consolidate fields based on them, but there is a service that already does this for you.
Form Recognizer:
Using Form Recognizer and your image, I got the following results for your receipt.
As you can see below, the understandingResults contains the total with its value ("value": 9.11), the MerchantName ("Chick-fil-a") and other fields.
{
"status": "Succeeded",
"recognitionResults": [
{
"page": 1,
"clockwiseOrientation": 0.17,
"width": 404,
"height": 1226,
"unit": "pixel",
"lines": [
{
"boundingBox": [
108,
55,
297,
56,
296,
71,
107,
70
],
"text": "Welcome to Chick-fil-a",
"words": [
{
"boundingBox": [
108,
56,
169,
56,
169,
71,
108,
71
],
"text": "Welcome",
"confidence": "Low"
},
{
"boundingBox": [
177,
56,
194,
56,
194,
71,
177,
71
],
"text": "to"
},
{
"boundingBox": [
201,
56,
296,
57,
296,
71,
201,
71
],
"text": "Chick-fil-a"
}
]
},
...
OTHER LINES CUT FOR DISPLAY
...
]
}
],
"understandingResults": [
{
"pages": [
1
],
"fields": {
"Subtotal": null,
"Total": {
"valueType": "numberValue",
"value": 9.11,
"text": "$9.11",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/32/words/0"
},
{
"$ref": "#/recognitionResults/0/lines/32/words/1"
}
]
},
"Tax": {
"valueType": "numberValue",
"value": 0.88,
"text": "$0.88",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/31/words/0"
},
{
"$ref": "#/recognitionResults/0/lines/31/words/1"
},
{
"$ref": "#/recognitionResults/0/lines/31/words/2"
}
]
},
"MerchantAddress": null,
"MerchantName": {
"valueType": "stringValue",
"value": "Chick-fil-a",
"text": "Chick-fil-a",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/0/words/2"
}
]
},
"MerchantPhoneNumber": {
"valueType": "stringValue",
"value": "+13092689500",
"text": "309-268-9500",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/4/words/0"
}
]
},
"TransactionDate": {
"valueType": "stringValue",
"value": "2019-06-21",
"text": "6/21/2019",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/6/words/0"
}
]
},
"TransactionTime": {
"valueType": "stringValue",
"value": "13:00:57",
"text": "1:00:57 PM",
"elements": [
{
"$ref": "#/recognitionResults/0/lines/6/words/1"
},
{
"$ref": "#/recognitionResults/0/lines/6/words/2"
}
]
}
}
}
]
}
More details on Form Recognizer: https://azure.microsoft.com/en-us/services/cognitive-services/form-recognizer/

Redact a JSON response using Python

I am using Cognitive Read API to extract the text from an image. I am getting the response as below:
{
"status": "Succeeded",
"recognitionResult": {
"lines": [
{
"boundingBox": [
2,
52,
65,
46,
69,
89,
7,
95
],
"text": "$230",
"words": [
{
"boundingBox": [
0,
59,
63,
43,
77,
86,
3,
102
],
"text": "$230"
}
]
},
{
"boundingBox": [
6,
2,
771,
13,
770,
75,
5,
64
],
"text": "The quick brown fox jumps over the lazy",
"words": [
{
"boundingBox": [
0,
4,
92,
5,
77,
71,
0,
71
],
"text": "The"
},
{
"boundingBox": [
74,
4,
189,
5,
174,
72,
60,
71
],
"text": "quick"
},
{
"boundingBox": [
176,
5,
321,
6,
306,
73,
161,
72
],
"text": "brown"
},
{
"boundingBox": [
308,
6,
387,
6,
372,
73,
293,
73
],
"text": "fox"
},
{
"boundingBox": [
382,
6,
506,
7,
491,
74,
368,
73
],
"text": "jumps"
},
{
"boundingBox": [
492,
7,
607,
8,
592,
75,
478,
74
],
"text": "over"
},
{
"boundingBox": [
589,
8,
673,
8,
658,
75,
575,
75
],
"text": "the"
},
{
"boundingBox": [
660,
8,
783,
9,
768,
76,
645,
75
],
"text": "lazy"
}
]
},
{
"boundingBox": [
2,
84,
783,
96,
782,
154,
1,
148
],
"text": "$78,000.00",
"words": [
{
"boundingBox": [
0,
86,
94,
87,
72,
151,
0,
149
],
"text": "$78,000.00"
},
{
"boundingBox": [
76,
87,
164,
88,
142,
152,
54,
150
],
"text": "my"
},
{
"boundingBox": [
155,
88,
243,
89,
222,
152,
134,
151
],
"text": "box"
},
{
"boundingBox": [
226,
89,
344,
90,
323,
154,
204,
152
],
"text": "with"
},
{
"boundingBox": [
336,
90,
432,
91,
411,
154,
314,
154
],
"text": "five"
},
{
"boundingBox": [
419,
91,
538,
92,
516,
154,
398,
154
],
"text": "dozen"
},
{
"boundingBox": [
547,
92,
701,
94,
679,
154,
525,
154
],
"text": "liquor"
},
{
"boundingBox": [
696,
94,
800,
95,
780,
154,
675,
154
],
"text": "jugs"
}
]
}
]
}
}
Now my requirement is to mask all the amounts ($230, $78,000.00) in the JSON response with XXXXX, or to black out those numbers.
I am using Azure Notebooks to build the code in Python 3.6.
Please help if anyone has worked on this before.
Thank You!
You can't control the JSON response, but you can control what you show to your users from the response!
You can check for each text returned if it includes the $ character and remove everything that comes after it!
Here's an example:
# str.partition splits at the first occurrence of the separator and returns a
# 3-tuple: (text before it, the separator itself, text after it).
text = 'some text$ this part will be removed.'
head, sep, tail = text.partition('$')
print(head)  # shows "some text"
print(sep)   # shows "$"
print(tail)  # shows " this part will be removed."

Resources