Redact a JSON response using Python

Redact a JSON response using Python - python-3.x

I am using Cognitive Read API to extract the text from an image. I am getting the response as below:
{
"status": "Succeeded",
"recognitionResult": {
"lines": [
{
"boundingBox": [
2,
52,
65,
46,
69,
89,
7,
95
],
"text": "$230",
"words": [
{
"boundingBox": [
0,
59,
63,
43,
77,
86,
3,
102
],
"text": "$230"
}
]
},
{
"boundingBox": [
6,
2,
771,
13,
770,
75,
5,
64
],
"text": "The quick brown fox jumps over the lazy",
"words": [
{
"boundingBox": [
0,
4,
92,
5,
77,
71,
0,
71
],
"text": "The"
},
{
"boundingBox": [
74,
4,
189,
5,
174,
72,
60,
71
],
"text": "quick"
},
{
"boundingBox": [
176,
5,
321,
6,
306,
73,
161,
72
],
"text": "brown"
},
{
"boundingBox": [
308,
6,
387,
6,
372,
73,
293,
73
],
"text": "fox"
},
{
"boundingBox": [
382,
6,
506,
7,
491,
74,
368,
73
],
"text": "jumps"
},
{
"boundingBox": [
492,
7,
607,
8,
592,
75,
478,
74
],
"text": "over"
},
{
"boundingBox": [
589,
8,
673,
8,
658,
75,
575,
75
],
"text": "the"
},
{
"boundingBox": [
660,
8,
783,
9,
768,
76,
645,
75
],
"text": "lazy"
}
]
},
{
"boundingBox": [
2,
84,
783,
96,
782,
154,
1,
148
],
"text": "$78,000.00",
"words": [
{
"boundingBox": [
0,
86,
94,
87,
72,
151,
0,
149
],
"text": "$78,000.00"
},
{
"boundingBox": [
76,
87,
164,
88,
142,
152,
54,
150
],
"text": "my"
},
{
"boundingBox": [
155,
88,
243,
89,
222,
152,
134,
151
],
"text": "box"
},
{
"boundingBox": [
226,
89,
344,
90,
323,
154,
204,
152
],
"text": "with"
},
{
"boundingBox": [
336,
90,
432,
91,
411,
154,
314,
154
],
"text": "five"
},
{
"boundingBox": [
419,
91,
538,
92,
516,
154,
398,
154
],
"text": "dozen"
},
{
"boundingBox": [
547,
92,
701,
94,
679,
154,
525,
154
],
"text": "liquor"
},
{
"boundingBox": [
696,
94,
800,
95,
780,
154,
675,
154
],
"text": "jugs"
}
]
}
]
}
}
Now my requirement is to mask all the amounts($230,$78,000.00) in the Json response with XXXXX or black out those numbers.
I am using azure notebooks to build the code in python 3.6.
Please help if anyone worked on this earlier.
Thank You!

You can't control the JSON response, but you can control what you show to your users from the response!
You can check for each text returned if it includes the $ character and remove everything that comes after it!
Here's an example:
text = 'some text$ this part will be removed.'
head, sep, tail = text.partition('$')
print(head) will show "some text"
print(sep) will show "$"
print(tail) will show " this part will be removed."

Related

How to Map the nested array using map function?

I/P:
[
{
"keyword": "outdoor cushion",
"bidInfo": [
{
"matchType": "EXACT",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 1,
"bid": 160,
"suggestedBid": {
"rangeStart": 90,
"rangeMedian": 160,
"rangeEnd": 188
}
},
{
"matchType": "PHRASE",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 1,
"bid": 104,
"suggestedBid": {
"rangeStart": 83,
"rangeMedian": 104,
"rangeEnd": 177
}
},
{
"matchType": "BROAD",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 1,
"bid": 106,
"suggestedBid": {
"rangeStart": 87,
"rangeMedian": 106,
"rangeEnd": 139
}
}
],
"translation": "アウトドアクッション",
"userSelectedKeyword": false,
"searchTermImpressionRank": 18,
"searchTermImpressionShare": 0.7346906874047442,
"recId": "e4918800-d0bf-4f20-8776-68574e4c8339"
},
{
"keyword": "cushion",
"bidInfo": [
{
"matchType": "EXACT",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 2,
"bid": 140,
"suggestedBid": {
"rangeStart": 140,
"rangeMedian": 140,
"rangeEnd": 140
}
},
{
"matchType": "BROAD",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 2,
"bid": 111,
"suggestedBid": {
"rangeStart": 79,
"rangeMedian": 111,
"rangeEnd": 155
}
},
{
"matchType": "PHRASE",
"theme": "CONVERSION_OPPORTUNITIES",
"rank": 2,
"bid": 107,
"suggestedBid": {
"rangeStart": 76,
"rangeMedian": 107,
"rangeEnd": 149
}
}
],
"translation": "クッション",
"userSelectedKeyword": false,
"searchTermImpressionRank": 60,
"searchTermImpressionShare": 0.04246088993932697,
"recId": "50a5705a-d79a-4f74-82bd-2836096b36fd"
}
]
trying:
const suggestionSp: { Rank: string; searchTerm: string; matchType: string; low: string; suggestedBid: string; high: string; translation: string; }[] = [];
if (suggestions.keywordTargetList.length > 0) {
await suggestions.keywordTargetList.map((item: { rank: string; keyword: string; matchType: string; suggestedBid: any; bid: any; bidInfo: any[]; translation: any; }) => {
suggestionSp.push({Rank: item.bidInfo.rank, searchTerm: item.keyword, matchType: item.bidInfo.matchType, low: item.bidInfo.suggestedBid.rangeStart, suggestedBid: item.bidInfo.suggestedBid.rangeMedian, high: item.bidInfo.suggestedBid.rangeEnd, translation: item.translation });
});
}
Expected O/P:
[{
"keyword": "outdoor cushion",
"matchType": "EXACT",
"rank": 1,
"rangeStart": 92,
"rangeMedian": 163,
"rangeEnd": 191,
"translation": "アウトドアクッション"
},
{
"keyword": "outdoor cushion",
"matchType": "PHRASE",
"rank": 1,
"rangeStart": 83,
"rangeMedian": 103,
"rangeEnd": 176,
"translation": "アウトドアクッション"
},
{
"matchType": "BROAD",
"rank": 1,
"rangeStart": 88,
"rangeMedian": 107,
"rangeEnd": 141,
"translation": "アウトドアクッション"
}
]

You can use the array's reduce method:
const output = input.reduce((result, current) => result.concat(
current.bidInfo.map(bidInfo => ({
"keyword": current.keyword,
"matchType": bidInfo.matchType,
"rank": bidInfo.rank,
"rangeStart": bidInfo.suggestedBid.rangeStart,
"rangeMedian": bidInfo.suggestedBid.rangeMedian,
"rangeEnd": bidInfo.suggestedBid.rangeEnd,
"translation": current.translation
})
)), []);
Used Reduce syntax:
reduce((accumulator, currentValue) => { /* … */ } , initialValue)

Get 9 most common RBG values from table of RGB values

Quick question.
I've got a table as such:
[
[ 86, 81, 89 ], [ 86, 81, 89 ], [ 86, 81, 89 ], [ 83, 78, 84 ],
[ 86, 81, 89 ], [ 83, 78, 84 ], [ 86, 81, 89 ], [ 86, 81, 89 ],
[ 83, 78, 84 ], [ 83, 78, 84 ], [ 80, 74, 80 ], [ 76, 69, 76 ],
[ 76, 69, 76 ], [ 76, 69, 76 ], [ 73, 65, 71 ], [ 67, 63, 70 ],
[ 67, 63, 70 ], [ 68, 60, 64 ], [ 66, 56, 58 ], [ 57, 51, 59 ],
[ 66, 56, 58 ], [ 66, 56, 58 ], [ 68, 60, 64 ], [ 68, 60, 64 ],
[ 68, 60, 64 ], [ 67, 63, 70 ], [ 73, 65, 71 ], [ 76, 69, 76 ],
[ 76, 69, 76 ], [ 76, 69, 76 ], [ 76, 69, 76 ], [ 73, 65, 71 ],
[ 73, 65, 71 ], [ 68, 60, 64 ], [ 68, 60, 64 ], [ 68, 60, 64 ],
[ 66, 56, 58 ], [ 57, 51, 59 ], [ 63, 52, 51 ], [ 63, 52, 51 ],
[ 56, 47, 48 ], [ 56, 47, 48 ], [ 57, 51, 59 ], [ 63, 52, 51 ],
[ 57, 51, 59 ], [ 66, 56, 58 ], [ 66, 56, 58 ], [ 68, 60, 64 ],
[ 66, 56, 58 ], [ 57, 51, 59 ], [ 66, 56, 58 ], [ 68, 60, 64 ],
[ 67, 63, 70 ], [ 76, 69, 76 ], [ 83, 78, 84 ], [ 86, 81, 89 ],
[ 86, 81, 89 ], [ 93, 87, 96 ], [ 100, 96, 105 ], [ 109, 104, 108 ],
[ 115, 108, 112 ], [ 119, 114, 120 ], [ 128, 122, 125 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 127, 125, 135 ], [ 134, 130, 137 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ], [ 135, 133, 143 ],
[ 143, 140, 145 ], [ 146, 143, 149 ], [ 149, 146, 152 ], [ 149, 146, 152 ],
[ 149, 146, 152 ], [ 152, 145, 147 ], [ 149, 146, 152 ], [ 149, 143, 144 ],
[ 146, 143, 149 ], [ 143, 140, 145 ], [ 143, 140, 145 ], [ 141, 137, 141 ],
[ 149, 146, 152 ], [ 157, 151, 154 ], [ 162, 160, 165 ], [ 168, 166, 171 ],
[ 174, 169, 172 ], [ 174, 169, 172 ], [ 168, 166, 171 ], [ 169, 164, 165 ],
[ 166, 160, 161 ], [ 160, 154, 155 ], [ 155, 148, 149 ], [ 149, 143, 144 ],
... 50525 more items
]
How would i get the 9 most common RGB values in that there table?
Thanks in advance.

Here's one way to do it. You would need to use your own, full array here. I used only the values you supplied. I am also not returning only the 9 most common rgb values, but all of them in order from most common to least common, so you would have to take the first 9 elements from the sortableRankings-array to get exactly the 9 most common rgbs.
const rgbs = [
[ 86, 81, 89 ], [ 86, 81, 89 ], [ 86, 81, 89 ], [ 83, 78, 84 ],
[ 86, 81, 89 ], [ 83, 78, 84 ], [ 86, 81, 89 ], [ 86, 81, 89 ],
[ 83, 78, 84 ], [ 83, 78, 84 ], [ 80, 74, 80 ], [ 76, 69, 76 ],
[ 76, 69, 76 ], [ 76, 69, 76 ], [ 73, 65, 71 ], [ 67, 63, 70 ],
[ 67, 63, 70 ], [ 68, 60, 64 ], [ 66, 56, 58 ], [ 57, 51, 59 ],
[ 66, 56, 58 ], [ 66, 56, 58 ], [ 68, 60, 64 ], [ 68, 60, 64 ],
[ 68, 60, 64 ], [ 67, 63, 70 ], [ 73, 65, 71 ], [ 76, 69, 76 ],
[ 76, 69, 76 ], [ 76, 69, 76 ], [ 76, 69, 76 ], [ 73, 65, 71 ],
[ 73, 65, 71 ], [ 68, 60, 64 ], [ 68, 60, 64 ], [ 68, 60, 64 ],
[ 66, 56, 58 ], [ 57, 51, 59 ], [ 63, 52, 51 ], [ 63, 52, 51 ],
[ 56, 47, 48 ], [ 56, 47, 48 ], [ 57, 51, 59 ], [ 63, 52, 51 ],
[ 57, 51, 59 ], [ 66, 56, 58 ], [ 66, 56, 58 ], [ 68, 60, 64 ],
[ 66, 56, 58 ], [ 57, 51, 59 ], [ 66, 56, 58 ], [ 68, 60, 64 ],
[ 67, 63, 70 ], [ 76, 69, 76 ], [ 83, 78, 84 ], [ 86, 81, 89 ],
[ 86, 81, 89 ], [ 93, 87, 96 ], [ 100, 96, 105 ], [ 109, 104, 108 ],
[ 115, 108, 112 ], [ 119, 114, 120 ], [ 128, 122, 125 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 127, 125, 135 ], [ 134, 130, 137 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ],
[ 134, 130, 137 ], [ 134, 130, 137 ], [ 134, 130, 137 ], [ 135, 133, 143 ],
[ 143, 140, 145 ], [ 146, 143, 149 ], [ 149, 146, 152 ], [ 149, 146, 152 ],
[ 149, 146, 152 ], [ 152, 145, 147 ], [ 149, 146, 152 ], [ 149, 143, 144 ],
[ 146, 143, 149 ], [ 143, 140, 145 ], [ 143, 140, 145 ], [ 141, 137, 141 ],
[ 149, 146, 152 ], [ 157, 151, 154 ], [ 162, 160, 165 ], [ 168, 166, 171 ],
[ 174, 169, 172 ], [ 174, 169, 172 ], [ 168, 166, 171 ], [ 169, 164, 165 ],
[ 166, 160, 161 ], [ 160, 154, 155 ], [ 155, 148, 149 ], [ 149, 143, 144 ]
]
// Something to save the rankings in
const rankings = {};
// Loop through the rgb values
for(let i = 0; i < rgbs.length; i++)
{
const rgb = rgbs[i];
const rgbString = rgb.join(','); // Make a "key" from the rgb triplet
// Store the ranking, or increment it by one if it already exists
if(!(rgbString in rankings))
rankings[rgbString] = 1;
else
rankings[rgbString]++;
}
// Can't easily sort rankings object and we need to use an object in the previous step to easily find existing rgb rankings by key,
// So now turn that object into an array which we can sort by the ranking
const sortableRankings = [];
for(let ranking in rankings)
{
sortableRankings.push([ranking, rankings[ranking]]);
}
// Sort by ranking from highest to lowest and log it out
console.log(sortableRankings.sort((a, b) => b[1] - a[1]));

Create pandas dataframe based on row search substring value

I am struggling to shape my Python data into a dataframe. Can anyone help me with the code that might get me there? It seems the easiest solution would be to create columns based on substrings of text from the rows but I cannot find documentation to get me the shape I am seeking from the rows.
Original Dataframe - no column headers, data all in rows
Desired Dataframe - bounding box rows to columns with uniform header, confidence to column
My response is structured as follows:
{
"status": "succeeded",
"createdDateTime": "2020-08-28T19:21:29Z",
"lastUpdatedDateTime": "2020-08-28T19:21:31Z",
"analyzeResult": {
"version": "3.0.0",
"readResults": [{
"page": 1,
"angle": 0.1296,
"width": 1700,
"height": 2200,
"unit": "pixel",
"lines": [{
"boundingBox": [
182,
119,
383,
119,
383,
161,
182,
160
],
"text": "FORM 101",
"words": [{
"boundingBox": [
183,
120,
305,
120,
305,
161,
182,
161
],
"text": "FORM",
"confidence": 0.987
},
{
"boundingBox": [
318,
120,
381,
120,
382,
162,
318,
161
],
"text": "101",
"confidence": 0.987
}
]
},
{
"boundingBox": [
578,
129,
1121,
129,
1121,
163,
578,
162
],
"text": "The Commonwealth of Massachusetts",
"words": [{
"boundingBox": [
579,
129,
634,
129,
634,
162,
579,
161
],
"text": "The",
"confidence": 0.988
},
{
"boundingBox": [
641,
129,
868,
129,
866,
164,
640,
162
],
"text": "Commonwealth",
"confidence": 0.979
},
{
"boundingBox": [
874,
129,
902,
129,
900,
164,
872,
164
],
"text": "of",
"confidence": 0.988
},
{
"boundingBox": [
908,
129,
1120,
130,
1117,
163,
906,
164
],
"text": "Massachusetts",
"confidence": 0.977
}
]
},
{
"boundingBox": [
1341,
137,
1540,
138,
1540,
164,
1341,
163
],
"text": "DIA USE ONLY",
"words": [{
"boundingBox": [
1342,
138,
1392,
138,
1392,
164,
1341,
163
],
"text": "DIA",
"confidence": 0.983
},
{
"boundingBox": [
1397,
138,
1452,
139,
1452,
164,
1397,
164
],
"text": "USE",
"confidence": 0.983
},
{
"boundingBox": [
1457,
139,
1539,
138,
1540,
164,
1457,
164
],
"text": "ONLY",
"confidence": 0.986
}
]
},
{
"boundingBox": [
459,
169,
1235,
168,
1235,
202,
459,
203
],
"text": "Department of Industrial Accidents - Department 101",
"words": [{
"boundingBox": [
460,
170,
634,
170,
634,
203,
460,
204
],
"text": "Department",
"confidence": 0.981
},
{
"boundingBox": [
640,
170,
669,
170,
669,
203,
640,
203
],
"text": "of",
"confidence": 0.983
},
{
"boundingBox": [
676,
170,
821,
169,
821,
203,
676,
203
],
"text": "Industrial",
"confidence": 0.981
},
{
"boundingBox": [
828,
169,
967,
169,
966,
203,
828,
203
],
"text": "Accidents",
"confidence": 0.952
},
{
"boundingBox": [
973,
169,
993,
169,
993,
203,
973,
203
],
"text": "-",
"confidence": 0.983
},
{
"boundingBox": [
1000,
169,
1176,
169,
1176,
203,
999,
203
],
"text": "Department",
"confidence": 0.982
},
{
"boundingBox": [
1183,
169,
1236,
169,
1235,
203,
1182,
203
],
"text": "101",
"confidence": 0.987
}
]
},
{
"boundingBox": [
511,
205,
1189,
205,
1189,
233,
511,
234
],
"text": "1 Congress Street, Suite 100, Boston, Massachusetts 02114-2017",
"words": [{
"boundingBox": [
513,
206,
520,
206,
519,
233,
512,
233
],
"text": "1",
"confidence": 0.974
},
{
"boundingBox": [
525,
206,
625,
206,
624,
234,
524,
233
],
"text": "Congress",
"confidence": 0.981
},
{
"boundingBox": [
630,
206,
702,
206,
701,
234,
629,
234
],
"text": "Street,",
"confidence": 0.977
},
{
"boundingBox": [
707,
206,
763,
206,
762,
234,
706,
234
],
"text": "Suite",
"confidence": 0.983
},
{
"boundingBox": [
769,
206,
812,
206,
811,
234,
767,
234
],
"text": "100,",
"confidence": 0.983
},
{
"boundingBox": [
818,
206,
898,
206,
897,
234,
816,
234
],
"text": "Boston,",
"confidence": 0.983
},
{
"boundingBox": [
903,
206,
1059,
205,
1058,
234,
902,
234
],
"text": "Massachusetts",
"confidence": 0.975
},
{
"boundingBox": [
1064,
205,
1189,
205,
1187,
233,
1063,
234
],
"text": "02114-2017",
"confidence": 0.978
}
]
},
{
"boundingBox": [
422,
236,
1279,
237,
1279,
263,
422,
263
],
"text": "Info. Line 800-323-3249 ext. 470 in Mass. Outside Mass. - 617-727-4900 ext. 470",
"words": [{
"boundingBox": [
423,
237,
472,
237,
472,
263,
422,
263
],
"text": "Info.",
"confidence": 0.983
},
{
"boundingBox": [
477,
237,
526,
237,
526,
264,
477,
264
],
"text": "Line",
"confidence": 0.986
},
{
"boundingBox": [
531,
237,
674,
237,
674,
264,
531,
264
],
"text": "800-323-3249",
"confidence": 0.977
},
{
"boundingBox": [
679,
237,
718,
237,
718,
264,
679,
264
],
"text": "ext.",
"confidence": 0.982
},
{
"boundingBox": [
724,
237,
763,
237,
763,
264,
723,
264
],
"text": "470",
"confidence": 0.986
},
{
"boundingBox": [
768,
237,
790,
237,
790,
264,
768,
264
],
"text": "in",
"confidence": 0.987
},
{
"boundingBox": [
795,
237,
865,
237,
865,
264,
795,
264
],
"text": "Mass.",
"confidence": 0.983
},
{
"boundingBox": [
870,
237,
953,
237,
953,
264,
870,
264
],
"text": "Outside",
"confidence": 0.981
},
{
"boundingBox": [
958,
237,
1019,
237,
1020,
264,
958,
264
],
"text": "Mass.",
"confidence": 0.984
},
{
"boundingBox": [
1025,
237,
1036,
237,
1037,
264,
1025,
264
],
"text": "-",
"confidence": 0.983
},
{
"boundingBox": [
1042,
237,
1184,
237,
1185,
264,
1042,
264
],
"text": "617-727-4900",
"confidence": 0.975
},
{
"boundingBox": [
1190,
237,
1229,
238,
1229,
264,
1190,
264
],
"text": "ext.",
"confidence": 0.985
},
{
"boundingBox": [
1234,
238,
1278,
238,
1278,
264,
1234,
264
],
"text": "470",
"confidence": 0.983
}
]
},
{
"boundingBox": [
716,
264,
984,
266,
984,
293,
715,
292
],
"text": "http://www.mass.gov/dia",
"words": [{
"boundingBox": [
717,
265,
985,
267,
984,
294,
716,
293
],
"text": "http://www.mass.gov/dia",
"confidence": 0.952
}]
},
{
"boundingBox": [
398,
299,
1289,
299,
1289,
342,
398,
342
],
"text": "EMPLOYER'S FIRST REPORT OF INJURY",
"words": [{
"boundingBox": [
399,
300,
693,
300,
693,
341,
399,
343
],
"text": "EMPLOYER'S",
"confidence": 0.98
},
{
"boundingBox": [
702,
300,
836,
300,
836,
341,
702,
341
],
"text": "FIRST",
"confidence": 0.982
},
{
"boundingBox": [
845,
300,
1036,
300,
1036,
341,
844,
341
],
"text": "REPORT",
"confidence": 0.985
},
{
"boundingBox": [
1045,
300,
1105,
300,
1104,
342,
1044,
341
],
"text": "OF",
"confidence": 0.988
},
{
"boundingBox": [
1113,
300,
1288,
299,
1287,
343,
1113,
342
],
"text": "INJURY",
"confidence": 0.986
}
]
},
{
"boundingBox": [
691,
354,
1005,
355,
1005,
395,
691,
393
],
"text": "OR FATALITY",
"words": [{
"boundingBox": [
691,
354,
760,
355,
760,
395,
692,
394
],
"text": "OR",
"confidence": 0.988
},
{
"boundingBox": [
768,
355,
1005,
356,
1003,
395,
768,
395
],
"text": "FATALITY",
"confidence": 0.981
}
]
}
]
}]
}
}

Without supplying your data or an explanation this mostly does what you want.
comments explain approach
there is more work to be done on linekey however I cannot see the relationship between the actual data and the outcome you posted as an image
import re
import numpy as np
import pandas as pd
df = pd.DataFrame(
{0:["analyzeResult_readResults_0_lines_0_text","analyzeResult_readResults_0_lines_0_words_0_boundingBox_0","analyzeResult_readResults_0_lines_0_words_0_boundingBox_1","analyzeResult_readResults_0_lines_0_words_0_boundingBox_2","analyzeResult_readResults_0_lines_0_words_0_boundingBox_3","analyzeResult_readResults_0_lines_0_words_0_boundingBox_4","analyzeResult_readResults_0_lines_0_words_0_boundingBox_5","analyzeResult_readResults_0_lines_0_words_0_boundingBox_6","analyzeResult_readResults_0_lines_0_words_0_boundingBox_7","analyzeResult_readResults_0_lines_0_words_0_text","analyzeResult_readResults_0_lines_0_words_0_confidence","analyzeResult_readResults_0_lines_0_words_1_boundingBox_0","analyzeResult_readResults_0_lines_0_words_1_boundingBox_1","analyzeResult_readResults_0_lines_0_words_1_boundingBox_2","analyzeResult_readResults_0_lines_0_words_1_boundingBox_3","analyzeResult_readResults_0_lines_0_words_1_boundingBox_4","analyzeResult_readResults_0_lines_0_words_1_boundingBox_5","analyzeResult_readResults_0_lines_0_words_1_boundingBox_6","analyzeResult_readResults_0_lines_0_words_1_boundingBox_7","analyzeResult_readResults_0_lines_0_words_1_text","analyzeResult_readResults_0_lines_0_words_1_confidence","analyzeResult_readResults_0_lines_1_boundingBox_0","analyzeResult_readResults_0_lines_1_boundingBox_1","analyzeResult_readResults_0_lines_1_boundingBox_2","analyzeResult_readResults_0_lines_1_boundingBox_3","analyzeResult_readResults_0_lines_1_boundingBox_4","analyzeResult_readResults_0_lines_1_boundingBox_5","analyzeResult_readResults_0_lines_1_boundingBox_6","analyzeResult_readResults_0_lines_1_boundingBox_7"],
1:["FORM 101",183,120,305,120,305,161,182,161,"FORM",0.987,318,120,381,120,382,162,318,161,101,0.987,578,129,1121,129,1121,163,578,162],
},
index=[17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45]
)
df = (
df
.rename(columns={0:"key",1:"val"})
.assign(
b=lambda x: x["key"].str.extract("(.*)_bounding"),
c=lambda x: x["key"].str.extract("(.*)_confidence"),
# linekey is everything before "_bounding" or "_confidence". pull the two together
linekey=lambda x: np.where(x["b"].isna(),
np.where(x["c"].isna(), x["key"], x["c"]),
x["b"]),
# column key is every thing after line key minus leading "_"
colkey=lambda x: x.apply(lambda r: r["key"].replace(r["linekey"], "").strip("_"), axis=1)
)
.assign(
# cleanup special case line keys...
colkey=lambda x: np.where(x["colkey"]=="", "Value", x["colkey"].replace("confidence","Confidence"))
)
# remove working columns
.drop(columns=["b","c","key"])
# mixed values and strings so use "first" and unstack to get to desired layout
.groupby(["linekey","colkey"]).agg({"val":"first"}).unstack()
)
print(df.to_string())
output
val
colkey Confidence Value boundingBox_0 boundingBox_1 boundingBox_2 boundingBox_3 boundingBox_4 boundingBox_5 boundingBox_6 boundingBox_7
linekey
analyzeResult_readResults_0_lines_0_text NaN FORM 101 NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_0_words_0 0.987 NaN 183 120 305 120 305 161 182 161
analyzeResult_readResults_0_lines_0_words_0_text NaN FORM NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_0_words_1 0.987 NaN 318 120 381 120 382 162 318 161
analyzeResult_readResults_0_lines_0_words_1_text NaN 101 NaN NaN NaN NaN NaN NaN NaN NaN
analyzeResult_readResults_0_lines_1 NaN NaN 578 129 1121 129 1121 163 578 162

How to read tabular format data image and store into csv file using custom vision api?

I have written python code to read text from image and store it into text files.
I'm able to capture the normal data from image but finding difficult to capture tabular format data from image and store into csv file. I'm able to read one column from image but other columns are missing in output. Is it possible to achieve the solution through custom vision api?

Sam, that's not the purpose of Custom Vision, which has 2 capabilities: classification or object detection.
Here you want to parse a form in an image, there a product for that called Form Recognizer API, described here
Have look to the preview samples in the page:
I made a quick test using your image with the Analyse Layout operation from Form Recognizer (see doc here), here is the output:
{
"status": "succeeded",
"createdDateTime": "2020-04-16T17:31:52Z",
"lastUpdatedDateTime": "2020-04-16T17:31:58Z",
"analyzeResult": {
"version": "2.0.0",
"readResults": [{
"page": 1,
"language": "en",
"angle": 0,
"width": 467,
"height": 113,
"unit": "pixel",
"lines": [{
"language": "en",
"boundingBox": [4, 6, 17, 5, 17, 15, 4, 16],
"text": "#",
"words": [{
"boundingBox": [6, 6, 13, 5, 14, 15, 7, 16],
"text": "#",
"confidence": 0.875
}]
}, {
"language": "en",
"boundingBox": [26, 6, 49, 6, 49, 15, 26, 15],
"text": "Item",
"words": [{
"boundingBox": [27, 6, 47, 6, 47, 15, 27, 15],
"text": "Item",
"confidence": 0.683
}]
}, {
"language": "en",
"boundingBox": [273, 5, 315, 4, 315, 16, 273, 16],
"text": "QTY/HR",
"words": [{
"boundingBox": [274, 5, 313, 5, 314, 16, 274, 16],
"text": "QTY/HR",
"confidence": 0.947
}]
}, {
"language": "en",
"boundingBox": [330, 5, 386, 6, 386, 17, 330, 16],
"text": "Unit price",
"words": [{
"boundingBox": [334, 6, 356, 6, 357, 17, 334, 17],
"text": "Unit",
"confidence": 0.959
}, {
"boundingBox": [359, 6, 385, 6, 385, 18, 359, 17],
"text": "price",
"confidence": 0.959
}]
}, {
"language": "en",
"boundingBox": [419, 6, 461, 6, 461, 16, 419, 16],
"text": "Amount",
"words": [{
"boundingBox": [420, 6, 461, 7, 461, 17, 420, 17],
"text": "Amount",
"confidence": 0.959
}]
}, {
"language": "en",
"boundingBox": [23, 20, 182, 20, 182, 32, 23, 32],
"text": "Installed office furniture (hours)",
"words": [{
"boundingBox": [26, 21, 68, 21, 68, 32, 26, 32],
"text": "Installed",
"confidence": 0.862
}, {
"boundingBox": [70, 21, 98, 21, 98, 32, 70, 32],
"text": "office",
"confidence": 0.958
}, {
"boundingBox": [100, 21, 145, 21, 145, 32, 100, 32],
"text": "furniture",
"confidence": 0.958
}, {
"boundingBox": [147, 21, 182, 21, 183, 33, 147, 32],
"text": "(hours)",
"confidence": 0.914
}]
}, {
"language": "en",
"boundingBox": [305, 22, 314, 22, 313, 31, 304, 32],
"text": "3",
"words": [{
"boundingBox": [308, 22, 313, 22, 314, 31, 308, 32],
"text": "3",
"confidence": 0.891
}]
}, {
"language": "en",
"boundingBox": [364, 21, 384, 21, 385, 30, 364, 31],
"text": "150",
"words": [{
"boundingBox": [366, 21, 384, 21, 385, 31, 366, 31],
"text": "150",
"confidence": 0.958
}]
}, {
"language": "en",
"boundingBox": [443, 21, 459, 21, 459, 30, 443, 31],
"text": "450",
"words": [{
"boundingBox": [443, 21, 459, 21, 459, 30, 443, 31],
"text": "450",
"confidence": 0.694
}]
}, {
"language": "en",
"boundingBox": [4, 37, 15, 37, 14, 47, 4, 47],
"text": "2",
"words": [{
"boundingBox": [7, 37, 14, 37, 14, 47, 7, 47],
"text": "2",
"confidence": 0.891
}]
}, {
"language": "en",
"boundingBox": [26, 36, 131, 37, 131, 48, 26, 47],
"text": "Herman Miller Aeron",
"words": [{
"boundingBox": [27, 37, 66, 37, 66, 47, 27, 48],
"text": "Herman",
"confidence": 0.959
}, {
"boundingBox": [69, 37, 99, 37, 99, 48, 69, 47],
"text": "Miller",
"confidence": 0.959
}, {
"boundingBox": [101, 37, 131, 38, 130, 48, 101, 48],
"text": "Aeron",
"confidence": 0.958
}]
}, {
"language": "en",
"boundingBox": [307, 37, 316, 38, 314, 48, 306, 48],
"text": "4",
"words": [{
"boundingBox": [308, 37, 315, 37, 315, 48, 307, 47],
"text": "4",
"confidence": 0.895
}]
}, {
"language": "en",
"boundingBox": [366, 37, 384, 37, 384, 47, 366, 46],
"text": "900",
"words": [{
"boundingBox": [366, 37, 384, 37, 384, 47, 366, 46],
"text": "900",
"confidence": 0.950
}]
}, {
"language": "en",
"boundingBox": [436, 37, 460, 36, 460, 46, 436, 47],
"text": "3600",
"words": [{
"boundingBox": [436, 37, 460, 36, 460, 46, 436, 47],
"text": "3600",
"confidence": 0.890
}]
}, {
"language": "en",
"boundingBox": [26, 52, 100, 53, 100, 63, 26, 62],
"text": "Sonos speakers",
"words": [{
"boundingBox": [27, 53, 56, 53, 56, 62, 27, 62],
"text": "Sonos",
"confidence": 0.959
}, {
"boundingBox": [58, 53, 100, 54, 100, 63, 58, 62],
"text": "speakers",
"confidence": 0.959
}]
}, {
"language": "en",
"boundingBox": [304, 52, 316, 52, 315, 62, 303, 62],
"text": "3",
"words": [{
"boundingBox": [307, 52, 314, 52, 314, 62, 307, 62],
"text": "3",
"confidence": 0.886
}]
}, {
"language": "en",
"boundingBox": [365, 51, 385, 51, 384, 62, 365, 62],
"text": "320",
"words": [{
"boundingBox": [365, 51, 385, 51, 385, 62, 365, 62],
"text": "320",
"confidence": 0.928
}]
}, {
"language": "en",
"boundingBox": [444, 52, 455, 52, 455, 61, 444, 61],
"text": "96",
"words": [{
"boundingBox": [444, 52, 454, 52, 454, 61, 444, 61],
"text": "96",
"confidence": 0.570
}]
}, {
"language": "en",
"boundingBox": [27, 67, 138, 67, 138, 79, 27, 79],
"text": "Giardino Grande Table",
"words": [{
"boundingBox": [28, 68, 69, 68, 69, 80, 28, 79],
"text": "Giardino",
"confidence": 0.861
}, {
"boundingBox": [71, 68, 109, 68, 109, 80, 71, 80],
"text": "Grande",
"confidence": 0.959
}, {
"boundingBox": [111, 68, 138, 67, 137, 80, 111, 80],
"text": "Table",
"confidence": 0.958
}]
}, {
"language": "en",
"boundingBox": [303, 68, 315, 66, 317, 76, 305, 78],
"text": "1",
"words": [{
"boundingBox": [308, 67, 314, 66, 316, 76, 309, 77],
"text": "1",
"confidence": 0.839
}]
}, {
"language": "en",
"boundingBox": [366, 67, 383, 68, 383, 77, 366, 77],
"text": "780",
"words": [{
"boundingBox": [366, 67, 383, 67, 382, 77, 366, 76],
"text": "780",
"confidence": 0.909
}]
}, {
"language": "en",
"boundingBox": [442, 68, 460, 67, 460, 77, 442, 77],
"text": "780",
"words": [{
"boundingBox": [442, 67, 460, 67, 460, 76, 442, 77],
"text": "780",
"confidence": 0.958
}]
}]
}],
"pageResults": [{
"page": 1,
"tables": [{
"rows": 4,
"columns": 4,
"cells": [{
"rowIndex": 0,
"columnIndex": 1,
"text": "Installed office furniture (hours)",
"boundingBox": [26, 21, 274, 21, 274, 34, 26, 34],
"elements": ["#/readResults/0/lines/5/words/0", "#/readResults/0/lines/5/words/1", "#/readResults/0/lines/5/words/2", "#/readResults/0/lines/5/words/3"]
}, {
"rowIndex": 0,
"columnIndex": 2,
"text": "3",
"boundingBox": [274, 21, 334, 21, 334, 34, 274, 34],
"elements": ["#/readResults/0/lines/6/words/0"]
}, {
"rowIndex": 0,
"columnIndex": 3,
"text": "150",
"boundingBox": [334, 21, 385, 21, 385, 34, 334, 34],
"elements": ["#/readResults/0/lines/7/words/0"]
}, {
"rowIndex": 1,
"columnIndex": 0,
"text": "2",
"boundingBox": [7, 34, 26, 34, 26, 50, 7, 50],
"elements": ["#/readResults/0/lines/9/words/0"]
}, {
"rowIndex": 1,
"columnIndex": 1,
"text": "Herman Miller Aeron",
"boundingBox": [26, 34, 274, 34, 274, 50, 26, 50],
"elements": ["#/readResults/0/lines/10/words/0", "#/readResults/0/lines/10/words/1", "#/readResults/0/lines/10/words/2"]
}, {
"rowIndex": 1,
"columnIndex": 2,
"text": "4",
"boundingBox": [274, 34, 334, 34, 334, 50, 274, 50],
"elements": ["#/readResults/0/lines/11/words/0"]
}, {
"rowIndex": 1,
"columnIndex": 3,
"text": "900",
"boundingBox": [334, 34, 385, 34, 385, 50, 334, 50],
"elements": ["#/readResults/0/lines/12/words/0"]
}, {
"rowIndex": 2,
"columnIndex": 1,
"text": "Sonos speakers",
"boundingBox": [26, 50, 274, 50, 274, 65, 26, 65],
"elements": ["#/readResults/0/lines/14/words/0", "#/readResults/0/lines/14/words/1"]
}, {
"rowIndex": 2,
"columnIndex": 2,
"text": "3",
"boundingBox": [274, 50, 334, 50, 334, 65, 274, 65],
"elements": ["#/readResults/0/lines/15/words/0"]
}, {
"rowIndex": 2,
"columnIndex": 3,
"text": "320",
"boundingBox": [334, 50, 385, 50, 385, 65, 334, 65],
"elements": ["#/readResults/0/lines/16/words/0"]
}, {
"rowIndex": 3,
"columnIndex": 1,
"text": "Giardino Grande Table",
"boundingBox": [26, 65, 274, 65, 274, 79, 26, 79],
"elements": ["#/readResults/0/lines/18/words/0", "#/readResults/0/lines/18/words/1", "#/readResults/0/lines/18/words/2"]
}, {
"rowIndex": 3,
"columnIndex": 2,
"text": "1",
"boundingBox": [274, 65, 334, 65, 334, 79, 274, 79],
"elements": ["#/readResults/0/lines/19/words/0"]
}, {
"rowIndex": 3,
"columnIndex": 3,
"text": "780",
"boundingBox": [334, 65, 385, 65, 385, 79, 334, 79],
"elements": ["#/readResults/0/lines/20/words/0"]
}]
}]
}]
}
}
As you can see, you have the array details in the output, looks like interesting for you!

Computer Vision REST API format

I am currently using the trial version of the Computer Vision API in java, So I acquired the code from the website and successfully got the JSON.
However, the format of the JSON I got was quite different than the one showed in the Demo page.
Example my Json response:
"regions": [
{
"boundingBox": "21,16,304,451",
"lines": [
{
"boundingBox": "28,16,288,41",
"words": [
{
"boundingBox": "28,16,288,41",
"text": "NOTHING"
}
]
},
Whereas the demo page is:
{
"lines": [
{
"boundingBox": [
122,
122,
401,
85,
404,
229,
143,
233
],
looking at the bounding box format, we can clearly see the difference

The response you get is the result of using the Computer Vision API's OCR as the example states:
{
"language": "en",
"textAngle": -2.0000000000000338,
"orientation": "Up",
"regions": [
{
"boundingBox": "462,379,497,258",
"lines": [
{
"boundingBox": "462,379,497,74",
"words": [
{
"boundingBox": "462,379,41,73",
"text": "A"
},
{
"boundingBox": "523,379,153,73",
"text": "GOAL"
},
{
"boundingBox": "694,379,265,74",
"text": "WITHOUT"
}
]
},
{
"boundingBox": "565,471,289,74",
"words": [
{
"boundingBox": "565,471,41,73",
"text": "A"
},
{
"boundingBox": "626,471,150,73",
"text": "PLAN"
},
{
"boundingBox": "801,472,53,73",
"text": "IS"
}
]
},
{
"boundingBox": "519,563,375,74",
"words": [
{
"boundingBox": "519,563,149,74",
"text": "JUST"
},
{
"boundingBox": "683,564,41,72",
"text": "A"
},
{
"boundingBox": "741,564,153,73",
"text": "WISH"
}
]
}
]
}
]
}
While the response from the demo page is the result of using the Computer Vision API's Recognize Text then Get Recognize Text Operation Result to get the result of the operation as the example states:
{
"status": "Succeeded",
"recognitionResult": {
"lines": [
{
"boundingBox": [
202,
618,
2047,
643,
2046,
840,
200,
813
],
"text": "Our greatest glory is not",
"words": [
{
"boundingBox": [
204,
627,
481,
628,
481,
830,
204,
829
],
"text": "Our"
},
{
"boundingBox": [
519,
628,
1057,
630,
1057,
832,
518,
830
],
"text": "greatest"
},
{
"boundingBox": [
1114,
630,
1549,
631,
1548,
833,
1114,
832
],
"text": "glory"
},
{
"boundingBox": [
1586,
631,
1785,
632,
1784,
834,
1586,
833
],
"text": "is"
},
{
"boundingBox": [
1822,
632,
2115,
633,
2115,
835,
1822,
834
],
"text": "not"
}
]
},
{
"boundingBox": [
420,
1273,
2954,
1250,
2958,
1488,
422,
1511
],
"text": "but in rising every time we fall",
"words": [
{
"boundingBox": [
423,
1269,
634,
1268,
635,
1507,
424,
1508
],
"text": "but"
},
{
"boundingBox": [
667,
1268,
808,
1268,
809,
1506,
668,
1507
],
"text": "in"
},
{
"boundingBox": [
874,
1267,
1289,
1265,
1290,
1504,
875,
1506
],
"text": "rising"
},
{
"boundingBox": [
1331,
1265,
1771,
1263,
1772,
1502,
1332,
1504
],
"text": "every"
},
{
"boundingBox": [
1812,
1263,
2178,
1261,
2179,
1500,
1813,
1502
],
"text": "time"
},
{
"boundingBox": [
2219,
1261,
2510,
1260,
2511,
1498,
2220,
1500
],
"text": "we"
},
{
"boundingBox": [
2551,
1260,
3016,
1258,
3017,
1496,
2552,
1498
],
"text": "fall"
}
]
},
{
"boundingBox": [
1612,
903,
2744,
935,
2738,
1139,
1607,
1107
],
"text": "in never failing ,",
"words": [
{
"boundingBox": [
1611,
934,
1707,
933,
1708,
1147,
1613,
1147
],
"text": "in"
},
{
"boundingBox": [
1753,
933,
2132,
930,
2133,
1144,
1754,
1146
],
"text": "never"
},
{
"boundingBox": [
2162,
930,
2673,
927,
2674,
1140,
2164,
1144
],
"text": "failing"
},
{
"boundingBox": [
2703,
926,
2788,
926,
2790,
1139,
2705,
1140
],
"text": ","
}
]
}
]
}
}

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

Redact a JSON response using Python - python-3.x

Related

How to Map the nested array using map function?

Get 9 most common RBG values from table of RGB values

Create pandas dataframe based on row search substring value

How to read tabular format data image and store into csv file using custom vision api?

Computer Vision REST API format

Categories

Resources