I have the following script that compares the XML files inside two zip archives and converts some NoneType values to empty strings.
The problem is that, at a certain point in the file check, it raises the error "not all arguments converted during string formatting".
Is there a way to resolve or bypass this? Here is the script:
import os
import sys
import zipfile
import xml.etree.ElementTree as ET
from pandas.errors import ParserError

# LOGGER and date_validate() are defined elsewhere in the module.

def compare_xml_files1(location1, location2):
    skip_attributes = ['href', 'id', 'ImportGUID', 'ItemSignature']
    try:
        with zipfile.ZipFile(location1, 'r') as location1_zip:
            with zipfile.ZipFile(location2, 'r') as location2_zip:
                xml_files = [f for f in location1_zip.namelist() if
                             f.endswith((".dita", ".ditamap", ".customerproperties"))]
                flag = 0
                for xml_file in xml_files:
                    flag1 = 0
                    filename = os.path.basename(xml_file)
                    try:
                        location1_xml = location1_zip.read(xml_file).decode()
                        location2_xml = location2_zip.read(xml_file).decode()
                    except KeyError:
                        LOGGER.warning(f"{filename} not found in Golden Source")
                        flag += 1
                        continue
                    location1_root = ET.fromstring(location1_xml)
                    location2_root = ET.fromstring(location2_xml)
                    LOGGER.info(f"Validating dita file: {filename}")
                    for elem1, elem2 in zip(location1_root.iter(), location2_root.iter()):
                        if elem1.tag == elem2.tag:
                            if (all(n in elem2.attrib for n in elem1.attrib) and
                                    all(n in elem1.attrib for n in elem2.attrib)):
                                for key in elem1.attrib:
                                    if key in skip_attributes:
                                        continue  # skip checking attributes in the exclusion list
                                    if elem1.attrib[key] != elem2.attrib[key]:
                                        flag += 1
                                        flag1 += 1
                                        LOGGER.warning('Dita file content mismatched at attribute level: ' + key)
                                        LOGGER.warning('Dita file content mismatched for Tag value: ' + elem1.tag)
                                        LOGGER.warning('Source Data mismatched attribute: ' + elem1.attrib[key])
                                        LOGGER.warning('Golden Data mismatched attribute: ' + elem2.attrib[key] + "\n")
                            else:
                                flag += 1
                                flag1 += 1
                                LOGGER.error('Attributes are missing at tag: ' + elem1.tag)
                                LOGGER.error('Attributes in Source file:')
                                LOGGER.error(elem1.attrib.keys())
                                LOGGER.error('Attributes in Golden file:')
                                LOGGER.error(elem2.attrib.keys())
                                print("\n")
                        if isinstance(elem1.text, str) and date_validate(elem1.text) is False:
                            if elem1.text != elem2.text:
                                flag += 1
                                flag1 += 1
                                if elem2.text is None:  # convert NoneType to empty string
                                    elem2.text = ""
                                LOGGER.warning('Dita file content mismatched for Tag value: ' + elem1.tag)
                                LOGGER.warning(f'Source Data mismatched text: {elem1.text}')
                                LOGGER.warning(f'Golden Data mismatched text: {elem2.text}\n')
                        else:
                            flag += 1
                            flag1 += 1
                            if elem2.text is None:  # convert NoneType to empty string
                                elem2.text = ""
                            LOGGER.warning('Dita file content mismatched at Tag value')
                            LOGGER.warning(f'Source Data mismatched text: {elem1.text}')
                            LOGGER.warning(f'Golden Data mismatched text: {elem2.text}\n')
                    if flag1 == 0:
                        LOGGER.info('%s Dita file content matched\n', filename)
                        print('*' * 100)
                    else:
                        LOGGER.error('%s Dita file content does not match\n', filename)
                        # This call raises "not all arguments converted during string formatting":
                        LOGGER.info('File Location :', location1, '\n')
                        print('*' * 100)
                if flag == 0:
                    return True
                LOGGER.error('Job has an error')
                return False
    except ParserError as e:
        filename, line_no, function_name = (e.__traceback__.tb_frame.f_code.co_filename,
                                            e.__traceback__.tb_lineno,
                                            e.__traceback__.tb_frame.f_code.co_name)
        error_message = e.args[0]
        LOGGER.error(f'ParseError: {error_message}')
        LOGGER.debug({'filename': filename, 'lineno': line_no, 'name': function_name,
                      'type': 'ParserError', 'message': error_message})
        return False
    except KeyError as e:
        filename, line_no, function_name = (e.__traceback__.tb_frame.f_code.co_filename,
                                            e.__traceback__.tb_lineno,
                                            e.__traceback__.tb_frame.f_code.co_name)
        error_message = e.args[0]
        LOGGER.error('KeyError: ' + str(e))
        LOGGER.debug({'filename': filename, 'lineno': line_no, 'name': function_name,
                      'type': 'KeyError', 'message': error_message})
        return False
    except Exception as e:
        # The catch-all handler must come last, or the specific handlers above are unreachable.
        LOGGER.error(str(e))
        sys.exit(1)
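The "not all arguments converted during string formatting" message comes from the line flagged above: LOGGER.info('File Location :', location1, '\n'). The stdlib logging module treats every positional argument after the message as a %-style formatting argument, and 'File Location :' contains no %s placeholders, so the internal msg % args step fails. A minimal sketch of the fix, assuming LOGGER is a standard logging.Logger (the location1 value here is just an illustrative path):

import logging

LOGGER = logging.getLogger(__name__)
location1 = 'source.zip'  # hypothetical path for illustration

# Give logging one %s placeholder per extra argument...
LOGGER.info('File Location : %s', location1)

# ...or pre-format the message yourself and pass a single string.
LOGGER.info(f'File Location : {location1}')

Either form logs the same text; the %s form has the small advantage that the string is only built if the INFO level is actually enabled.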
I am new to Textract and am trying to extract data from a PDF as key-value pairs. The document is around 15 pages, so I used the asynchronous example below to get the data, but I am getting this error.
Error:
botocore.errorfactory.InvalidParameterException: An error occurred (InvalidParameterException) when calling the StartDocumentAnalysis operation: Request has invalid parameters
The example is from https://docs.aws.amazon.com/textract/latest/dg/async-analyzing-with-sqs.html
import boto3
import json
import sys
import time

class ProcessType:
    DETECTION = 1
    ANALYSIS = 2

class DocumentProcessor:
    jobId = ''
    region_name = ''
    roleArn = ''
    bucket = ''
    document = ''
    sqsQueueUrl = ''
    snsTopicArn = ''
    processType = ''

    def __init__(self, role, bucket, document, region):
        self.roleArn = role
        self.bucket = bucket
        self.document = document
        self.region_name = region
        self.textract = boto3.client('textract', region_name=self.region_name)
        self.sqs = boto3.client('sqs', region_name=self.region_name)
        self.sns = boto3.client('sns', region_name=self.region_name)
    def ProcessDocument(self, type):
        jobFound = False
        self.processType = type
        validType = False
        # Determine which type of processing to perform
        if self.processType == ProcessType.DETECTION:
            response = self.textract.start_document_text_detection(
                DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
                NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
            print('Processing type: Detection')
            validType = True
        # For document analysis, select which features you want to obtain with the FeatureTypes argument
        if self.processType == ProcessType.ANALYSIS:
            response = self.textract.start_document_analysis(
                DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
                FeatureTypes=["TABLES", "FORMS"],
                NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
            print('Processing type: Analysis')
            validType = True
        if validType == False:
            print("Invalid processing type. Choose Detection or Analysis.")
            return
        print('Start Job Id: ' + response['JobId'])
        dotLine = 0
        while jobFound == False:
            sqsResponse = self.sqs.receive_message(QueueUrl=self.sqsQueueUrl, MessageAttributeNames=['ALL'],
                                                   MaxNumberOfMessages=10)
            if sqsResponse:
                if 'Messages' not in sqsResponse:
                    if dotLine < 40:
                        print('.', end='')
                        dotLine = dotLine + 1
                    else:
                        print()
                        dotLine = 0
                    sys.stdout.flush()
                    time.sleep(5)
                    continue
                for message in sqsResponse['Messages']:
                    notification = json.loads(message['Body'])
                    textMessage = json.loads(notification['Message'])
                    print(textMessage['JobId'])
                    print(textMessage['Status'])
                    if str(textMessage['JobId']) == response['JobId']:
                        print('Matching Job Found:' + textMessage['JobId'])
                        jobFound = True
                        self.GetResults(textMessage['JobId'])
                        self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
                                                ReceiptHandle=message['ReceiptHandle'])
                    else:
                        print("Job didn't match:" +
                              str(textMessage['JobId']) + ' : ' + str(response['JobId']))
                        # Delete the unknown message. Consider sending to dead letter queue
                        self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
                                                ReceiptHandle=message['ReceiptHandle'])
        print('Done!')
    def CreateTopicandQueue(self):
        millis = str(int(round(time.time() * 1000)))
        # Create SNS topic
        snsTopicName = "AmazonTextractTopic" + millis
        topicResponse = self.sns.create_topic(Name=snsTopicName)
        self.snsTopicArn = topicResponse['TopicArn']
        # Create SQS queue
        sqsQueueName = "AmazonTextractQueue" + millis
        self.sqs.create_queue(QueueName=sqsQueueName)
        self.sqsQueueUrl = self.sqs.get_queue_url(QueueName=sqsQueueName)['QueueUrl']
        attribs = self.sqs.get_queue_attributes(QueueUrl=self.sqsQueueUrl,
                                                AttributeNames=['QueueArn'])['Attributes']
        sqsQueueArn = attribs['QueueArn']
        # Subscribe SQS queue to SNS topic
        self.sns.subscribe(
            TopicArn=self.snsTopicArn,
            Protocol='sqs',
            Endpoint=sqsQueueArn)
        # Authorize SNS to write to the SQS queue
        policy = """{{
          "Version":"2012-10-17",
          "Statement":[
            {{
              "Sid":"MyPolicy",
              "Effect":"Allow",
              "Principal" : {{"AWS" : "*"}},
              "Action":"SQS:SendMessage",
              "Resource": "{}",
              "Condition":{{
                "ArnEquals":{{
                  "aws:SourceArn": "{}"
                }}
              }}
            }}
          ]
        }}""".format(sqsQueueArn, self.snsTopicArn)
        response = self.sqs.set_queue_attributes(
            QueueUrl=self.sqsQueueUrl,
            Attributes={
                'Policy': policy
            })
    def DeleteTopicandQueue(self):
        self.sqs.delete_queue(QueueUrl=self.sqsQueueUrl)
        self.sns.delete_topic(TopicArn=self.snsTopicArn)

    # Display information about a block
    def DisplayBlockInfo(self, block):
        print("Block Id: " + block['Id'])
        print("Type: " + block['BlockType'])
        if 'EntityTypes' in block:
            print('EntityTypes: {}'.format(block['EntityTypes']))
        if 'Text' in block:
            print("Text: " + block['Text'])
        # Test the block dict for the key; the original tested str(block['BlockType']),
        # which never contains "Confidence", so the confidence was never printed.
        if block['BlockType'] != 'PAGE' and 'Confidence' in block:
            print("Confidence: " + "{:.2f}".format(block['Confidence']) + "%")
        print('Page: {}'.format(block['Page']))
        if block['BlockType'] == 'CELL':
            print('Cell Information')
            print('\tColumn: {} '.format(block['ColumnIndex']))
            print('\tRow: {}'.format(block['RowIndex']))
            print('\tColumn span: {} '.format(block['ColumnSpan']))
            print('\tRow span: {}'.format(block['RowSpan']))
            if 'Relationships' in block:
                print('\tRelationships: {}'.format(block['Relationships']))
        if 'Geometry' in block:  # check the dict, not its string representation
            print('Geometry')
            print('\tBounding Box: {}'.format(block['Geometry']['BoundingBox']))
            print('\tPolygon: {}'.format(block['Geometry']['Polygon']))
        if block['BlockType'] == 'SELECTION_ELEMENT':
            print('    Selection element detected: ', end='')
            if block['SelectionStatus'] == 'SELECTED':
                print('Selected')
            else:
                print('Not selected')
        if block["BlockType"] == "QUERY":
            print("Query info:")
            print(block["Query"])
        if block["BlockType"] == "QUERY_RESULT":
            print("Query answer:")
            print(block["Text"])
    def GetResults(self, jobId):
        maxResults = 1000
        paginationToken = None
        finished = False
        while finished == False:
            response = None
            if self.processType == ProcessType.ANALYSIS:
                if paginationToken == None:
                    response = self.textract.get_document_analysis(JobId=jobId,
                                                                   MaxResults=maxResults)
                else:
                    response = self.textract.get_document_analysis(JobId=jobId,
                                                                   MaxResults=maxResults,
                                                                   NextToken=paginationToken)
            if self.processType == ProcessType.DETECTION:
                if paginationToken == None:
                    response = self.textract.get_document_text_detection(JobId=jobId,
                                                                         MaxResults=maxResults)
                else:
                    response = self.textract.get_document_text_detection(JobId=jobId,
                                                                         MaxResults=maxResults,
                                                                         NextToken=paginationToken)
            blocks = response['Blocks']
            print('Detected Document Text')
            print('Pages: {}'.format(response['DocumentMetadata']['Pages']))
            # Display block information
            for block in blocks:
                self.DisplayBlockInfo(block)
                print()
                print()
            if 'NextToken' in response:
                paginationToken = response['NextToken']
            else:
                finished = True

    def GetResultsDocumentAnalysis(self, jobId):
        maxResults = 1000
        paginationToken = None
        finished = False
        while finished == False:
            response = None
            if paginationToken == None:
                response = self.textract.get_document_analysis(JobId=jobId,
                                                               MaxResults=maxResults)
            else:
                response = self.textract.get_document_analysis(JobId=jobId,
                                                               MaxResults=maxResults,
                                                               NextToken=paginationToken)
            # Get the text blocks
            blocks = response['Blocks']
            print('Analyzed Document Text')
            print('Pages: {}'.format(response['DocumentMetadata']['Pages']))
            # Display block information
            for block in blocks:
                self.DisplayBlockInfo(block)
                print()
                print()
            if 'NextToken' in response:
                paginationToken = response['NextToken']
            else:
                finished = True
def main():
    bucket = 'poyocr'
    document = '018394d5-0dd80d.pdf'
    region_name = 'us-west-1'
    roleArn = 'an:as:iam::58:usr/tect'  # redacted placeholder; not a valid IAM role ARN
    analyzer = DocumentProcessor(roleArn, bucket, document, region_name)
    analyzer.CreateTopicandQueue()
    analyzer.ProcessDocument(ProcessType.ANALYSIS)
    analyzer.DeleteTopicandQueue()

if __name__ == "__main__":
    main()
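InvalidParameterException from StartDocumentAnalysis usually points at the request values rather than the code: the roleArn above is not a valid IAM role ARN (a real one looks like arn:aws:iam::<account-id>:role/<role-name>), and the S3 bucket must be in the same region as the Textract client (us-west-1 here). A small pre-flight check, as a sketch under those assumptions (it only verifies the ARN shape and that the object is reachable):

import re
import boto3

def preflight(bucket, document, role_arn, region):
    # A Textract notification role must be a real IAM role ARN.
    if not re.match(r'^arn:aws:iam::\d{12}:role/.+', role_arn):
        raise ValueError('roleArn is malformed: ' + role_arn)
    # head_object raises ClientError if the key is missing or not accessible.
    s3 = boto3.client('s3', region_name=region)
    s3.head_object(Bucket=bucket, Key=document)

preflight(bucket='poyocr', document='018394d5-0dd80d.pdf',
          role_arn='an:as:iam::58:usr/tect', region='us-west-1')  # raises ValueError here

Checking these two things first turns a vague InvalidParameterException into a specific, local error.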
I am getting the error "TypeError: 'list' object is not callable". I changed the parentheses to square brackets, but it still does not work.
def getColumnType(value):
    dataType = None
    try:
        numericVal = int(value)
        dataType = int
    except ValueError:
        numericVal = float(value)
        dataType = float
    return dataType

def extractColFromFile(fileName, colIdx):
    try:
        columnName = ""
        datalist = []
        with open(fileName) as file:
            firstLine = file.readline().strip()
            colCount = firstLine.count(',') + 1
            contents = [x.strip() for x in file.readlines()]
            floatCount = 0
            if colIdx in range(0, colCount):
                columnName = firstLine.split(',')[colIdx]
                for line in contents:
                    columns = line.split(',')
                    for idx, data in enumerate(columns):
                        if idx == colIdx:
                            if getColumnType(data) == float:
                                floatCount += 1
                            datalist.append(data)
                if floatCount > 0:
                    classType = float
                else:
                    classType = int
                result = list(map(classType, datalist))
                return columnName, result
            else:
                print("invalid column index value")
    except OSError as e:
        print(e)

x = int(input("Enter a number: "))
result = extractColFromFile("task1.csv", x)
print(result)
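Nothing in this snippet itself misuses a list: list(...) on the result = list(map(classType, datalist)) line is the builtin, so "'list' object is not callable" almost always means the name list has been rebound to an actual list somewhere else in the file or REPL session before this runs. A minimal reproduction and fix, as a sketch:

list = [1, 2, 3]                      # rebinding the name shadows the builtin
list(map(int, ['1', '2']))            # TypeError: 'list' object is not callable

del list                              # remove the shadowing name (module/REPL level only)
values = list(map(int, ['1', '2']))   # works again: [1, 2]

The durable fix is to rename the offending variable (e.g. datalist, columnValues) rather than delete it; changing parentheses to square brackets does not address this, because the problem is the name list, not the call syntax.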
Any ideas on how to avoid Error 429: Too Many Requests?
I am just a beginner. I am trying to get IP data for my database, which consists of Office 365 audit data.
Am I using the wrong library for this task?
import time
import pandas as pd
# connection() is a database helper defined elsewhere.

def ipInfo(addr=''):
    from urllib.request import urlopen
    from json import load
    if addr == '':
        url = 'https://ipinfo.io/json'
    else:
        url = 'https://ipinfo.io/' + addr + '/json'
    res = urlopen(url)  # response from url (if res is None, check the connection)
    JSONtext = ''
    data = load(res)  # load the JSON response into data
    for attr in data.keys():
        # rebuild the response line by line as a JSON string
        if attr == 'ip':
            JSONtext = JSONtext + '{' + '"' + ''.join(attr) + '"' + ':' + '"' + ''.join(data[attr]) + '"'
        elif attr == 'readme':
            JSONtext = JSONtext + '"' + ''.join(attr) + '"' + ':' + '"' + ''.join(data[attr]) + '"' + '}'
        else:
            JSONtext = JSONtext + '"' + ''.join(attr) + '"' + ':' + '"' + ''.join(data[attr]) + '"'
    return JSONtext

# Get table list
crsr = connection().cursor()
crsr.execute(
    "SELECT id,creationdate, userids, operations,auditdata ->> 'ClientIP' AS client_ip,ipdata FROM audits WHERE operations ILIKE '%login%' LIMIT 5;")
tpl = crsr.fetchall()
crsr.close()

# Append each row's values to the dictionary
dict = {"id": [], "creationdate": [], "userids": [], "operations": [], "clientip": [], "ipdata": []}
for items in tpl:
    datalist = list(items)  # convert the row tuple into a list
    print('Processing')
    for i in datalist:
        if i == datalist[0]:
            dict["id"].append(i)
        elif i == datalist[1]:
            dict["creationdate"].append(i)
        elif i == datalist[2]:
            dict["userids"].append(i)
        elif i == datalist[3]:
            dict["operations"].append(i)
        elif i == datalist[4]:
            dict["clientip"].append(i)
            ip = ''.join(i)
        else:
            dict["ipdata"].append(ipInfo(ip))
            time.sleep(6)
print('Task Completed')
df = pd.DataFrame.from_dict(dict)
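ipinfo.io rate-limits unauthenticated clients, so issuing one request per row will eventually return HTTP 429 no matter how the JSON is parsed; urllib is not the problem. The standard remedies are to cache lookups (audit rows repeat the same client IP constantly), send an API token, and back off and retry when a 429 does arrive. A sketch along those lines (IPINFO_TOKEN is a hypothetical variable holding your token):

import json
import time
from urllib.error import HTTPError
from urllib.request import urlopen

_cache = {}

def ip_info(addr, token=None, retries=3):
    if addr in _cache:                    # duplicate IPs cost nothing
        return _cache[addr]
    url = 'https://ipinfo.io/' + addr + '/json'
    if token:                             # a token raises the rate limit considerably
        url += '?token=' + token
    for attempt in range(retries):
        try:
            data = json.load(urlopen(url))
            _cache[addr] = data
            return data
        except HTTPError as e:
            if e.code == 429:             # rate-limited: wait 1s, 2s, 4s, ...
                time.sleep(2 ** attempt)
            else:
                raise
    raise RuntimeError('still rate-limited after %d attempts' % retries)

dict["ipdata"].append(json.dumps(ip_info(ip, token=IPINFO_TOKEN)))

This also replaces the manual JSONtext string-building with json.dumps, which handles quoting and non-string values correctly.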
When the following bit of code runs, specifically the last 'else' branch, I get this error: OSError: [Errno 22] Invalid argument: 'My Name\n-Groups.txt'
What should I do so that '\n' isn't included in the file name? I would like it to just be 'My Name-Groups.txt'.
# choicebox comes from easygui; main() and child_already_in_group() are defined elsewhere.
def add_child_to_group():
    file = open("Children.txt", 'r')  # open the Children.txt file
    lineList = file.readlines()  # NB: each entry keeps its trailing '\n'
    lineList.sort()
    file.close()
    choice1 = choicebox('Choose a child to enter into a group.', 'Add child to a group. ', choices=lineList)
    if choice1 is None:
        print("You cancelled... returning to the main menu.")
        main()
        return
    else:
        file = open("Groups.txt", 'r')
        lineList = [line.strip() for line in file]
        choice2 = choicebox("Which group would you like to add the child to?", "Choose a group.",
                            choices=lineList)
        file.close()
        if choice2 is None:
            print("You cancelled... returning to the main menu.")
            main()
            return
        else:
            if choice1 in open('%s.txt' % choice2).read():
                child_already_in_group(choice1, choice2)
                return
            else:
                file1 = open('%s.txt' % choice2, 'a')
                file1.write(str(choice1))
                print(str(choice1) + " was added to the " + str(choice2) + " group")
                file1.close()
                file2 = open('%s-Groups.txt' % choice1, 'a')
                file2.write(str(choice2))
Something like this will do:
>>> st = 'My Name\n-Groups.txt'
>>> st.replace('\n','')
'My Name-Groups.txt'
>>>
So, in your code, you can make the following change:
file2 = open(('%s-Groups.txt' % choice1).replace('\n',''), 'a')
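Alternatively, strip the newline when the list is first read, so every later use of choice1 is already clean; note the Groups.txt branch already does exactly this, which is why only the child names carry a '\n':

lineList = [line.strip() for line in file]  # instead of file.readlines()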
I am trying to use the following code to filter a txt file based on the info at indexes 89, 90, 91, and 92.
The problem is that the output file is empty apart from the header. The code does not raise any error, so I am not sure how to go about debugging it.
Thanks for helping!
from operator import itemgetter
from os import system

# fileHandle is the opened input file and currentFile its name; both are set earlier in the script.
firstLineFlag = 0
data = []
for line in fileHandle:
    if firstLineFlag == 0:  # skip the first line
        firstLineFlag = 1
        firstLineText = line  # save the header line for the output file
        continue
    parts = line.strip().split('\t')
    try:
        column13 = float(parts[13 - 1])
    except ValueError:
        column13 = 0
    if column13 < 0.01:
        if parts[92] == "./.:.:.:.:.":
            Nor_info_2 = parts[92].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[91] == "./.:.:.:.:.":
            Nor_info = parts[91].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[90] == "./.:.:.:.:.":
            Tu_info_2 = parts[90].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[89] == "./.:.:.:.:.":
            Tu_info = parts[89].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        normalSplit_2 = parts[92].split(':')
        normalSplit = parts[91].split(':')
        tumorSplit_2 = parts[90].split(':')
        tumorSplit = parts[89].split(':')
        print(Nor_info_2)
        try:
            TD_Tumor_2 = float(tumorSplit_2[3 - 1])
        except ValueError:
            TD_Tumor_2 = 0  # the original assigned TD_Tumor here, leaving TD_Tumor_2 undefined
        try:
            TD_Tumor = float(tumorSplit[3 - 1])
        except ValueError:
            TD_Tumor = 0
        try:
            TD_Normal_2 = float(normalSplit_2[3 - 1])
        except ValueError:
            TD_Normal_2 = 0  # the original assigned TD_Tumor here
        try:
            TD_Normal = float(normalSplit[3 - 1])
        except ValueError:
            TD_Normal = 0  # the original assigned TD_Tumor here
        if TD_Tumor_2 >= TD_Tumor and TD_Tumor_2 >= 7:
            tumorAD = tumorSplit_2[2 - 1].split(',')
            normalAD = normalSplit_2[2 - 1].split(',')
            normalratio = float(normalAD[2 - 1]) / TD_Normal_2
        else:
            tumorAD = tumorSplit[2 - 1].split(',')
            normalAD = normalSplit[2 - 1].split(',')
            normalratio = float(normalAD[2 - 1]) / TD_Normal
        tumorratio = float(tumorAD[2 - 1]) / TD_Tumor_2
        parts.append(tumorratio)
        parts.append(normalratio)
        data.append(parts)

dataz1 = sorted(data, key=itemgetter(91), reverse=True)
# Write the data out in the proper format
with open(currentFile + '_filtered.txt', 'w') as fileHandle:
    fileHandle.write(firstLineText)
    for item in data:
        convert_first_to_generator = (str(w) for w in item)
        string = '\t'.join(convert_first_to_generator)
        string += '\n'
        fileHandle.write(string)
# Move edited files into a different folder
command = 'mv ' + currentFile + '_filtered.txt filtered/'
system(command)
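When a filter like this silently writes only the header, the row loop is dropping everything before data.append(parts) is reached: typically the file is not actually tab-delimited, column 13 never tests below 0.01, or one of the except branches zeroes the wrong variable (as the comments above note, the original assigned TD_Tumor in place of TD_Tumor_2, TD_Normal_2, and TD_Normal, which can leave names undefined or divide by zero). A quick instrumentation pass, as a sketch (currentFile is assumed to be the input file name):

total = kept = short = 0
with open(currentFile) as fh:
    next(fh)                                  # skip the header
    for line in fh:
        total += 1
        parts = line.rstrip('\n').split('\t')
        if len(parts) <= 92:                  # a wrong delimiter shows up here immediately
            short += 1
            continue
        try:
            column13 = float(parts[12])
        except ValueError:
            column13 = 0
        if column13 < 0.01:
            kept += 1
print(total, 'rows;', short, 'too short;', kept, 'passed the column-13 filter')

The counts tell you which condition is eating the rows before you touch the main script.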