I am new to Textract and am trying to extract data from a PDF and view it as key/value pairs. The document has around 15 pages, so I used this asynchronous example to get the data, but I am getting the error below.
Error:
botocore.errorfactory.InvalidParameterException: An error occurred (InvalidParameterException) when calling the StartDocumentAnalysis operation: Request has invalid parameters
https://docs.aws.amazon.com/textract/latest/dg/async-analyzing-with-sqs.html
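For reference, StartDocumentAnalysis usually rejects the request when the role ARN is malformed, when the S3 object does not exist or is not a supported format, or when the bucket lives in a different region than the Textract client. A minimal pre-flight sketch (the bucket, key, role, and region values below are placeholders, not values from the question) can help narrow down which parameter is invalid:

import re
import boto3

# Placeholder values for illustration only -- substitute your own.
bucket = 'my-bucket'
document = 'my-document.pdf'
region = 'us-west-1'
role_arn = 'arn:aws:iam::123456789012:role/TextractRole'

# A valid IAM role ARN has the form arn:aws:iam::<12-digit-account>:role/<name>.
if not re.match(r'^arn:aws:iam::\d{12}:role/.+$', role_arn):
    print('Role ARN is malformed: ' + role_arn)

s3 = boto3.client('s3', region_name=region)

# The object must exist; head_object raises a ClientError if it does not.
s3.head_object(Bucket=bucket, Key=document)

# The bucket must be in the same region as the Textract client.
location = s3.get_bucket_location(Bucket=bucket)['LocationConstraint'] or 'us-east-1'
if location != region:
    print('Bucket region %s does not match Textract region %s' % (location, region))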
import boto3
import json
import sys
import time
class ProcessType:
DETECTION = 1
ANALYSIS = 2
class DocumentProcessor:
jobId = ''
region_name = ''
roleArn = ''
bucket = ''
document = ''
sqsQueueUrl = ''
snsTopicArn = ''
processType = ''
def __init__(self, role, bucket, document, region):
self.roleArn = role
self.bucket = bucket
self.document = document
self.region_name = region
self.textract = boto3.client('textract', region_name=self.region_name)
self.sqs = boto3.client('sqs', region_name=self.region_name)
self.sns = boto3.client('sns', region_name=self.region_name)
def ProcessDocument(self, type):
jobFound = False
self.processType = type
validType = False
# Determine which type of processing to perform
if self.processType == ProcessType.DETECTION:
response = self.textract.start_document_text_detection(
DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
print('Processing type: Detection')
validType = True
# For document analysis, select which features you want to obtain with the FeatureTypes argument
if self.processType == ProcessType.ANALYSIS:
response = self.textract.start_document_analysis(
DocumentLocation={'S3Object': {'Bucket': self.bucket, 'Name': self.document}},
FeatureTypes=["TABLES", "FORMS"],
NotificationChannel={'RoleArn': self.roleArn, 'SNSTopicArn': self.snsTopicArn})
print('Processing type: Analysis')
validType = True
if validType == False:
print("Invalid processing type. Choose Detection or Analysis.")
return
print('Start Job Id: ' + response['JobId'])
dotLine = 0
while jobFound == False:
sqsResponse = self.sqs.receive_message(QueueUrl=self.sqsQueueUrl, MessageAttributeNames=['ALL'],
MaxNumberOfMessages=10)
if sqsResponse:
if 'Messages' not in sqsResponse:
if dotLine < 40:
print('.', end='')
dotLine = dotLine + 1
else:
print()
dotLine = 0
sys.stdout.flush()
time.sleep(5)
continue
for message in sqsResponse['Messages']:
notification = json.loads(message['Body'])
textMessage = json.loads(notification['Message'])
print(textMessage['JobId'])
print(textMessage['Status'])
if str(textMessage['JobId']) == response['JobId']:
print('Matching Job Found:' + textMessage['JobId'])
jobFound = True
self.GetResults(textMessage['JobId'])
self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
ReceiptHandle=message['ReceiptHandle'])
else:
print("Job didn't match:" +
str(textMessage['JobId']) + ' : ' + str(response['JobId']))
# Delete the unknown message. Consider sending to dead letter queue
self.sqs.delete_message(QueueUrl=self.sqsQueueUrl,
ReceiptHandle=message['ReceiptHandle'])
print('Done!')
def CreateTopicandQueue(self):
millis = str(int(round(time.time() * 1000)))
# Create SNS topic
snsTopicName = "AmazonTextractTopic" + millis
topicResponse = self.sns.create_topic(Name=snsTopicName)
self.snsTopicArn = topicResponse['TopicArn']
# create SQS queue
sqsQueueName = "AmazonTextractQueue" + millis
self.sqs.create_queue(QueueName=sqsQueueName)
self.sqsQueueUrl = self.sqs.get_queue_url(QueueName=sqsQueueName)['QueueUrl']
attribs = self.sqs.get_queue_attributes(QueueUrl=self.sqsQueueUrl,
AttributeNames=['QueueArn'])['Attributes']
sqsQueueArn = attribs['QueueArn']
# Subscribe SQS queue to SNS topic
self.sns.subscribe(
TopicArn=self.snsTopicArn,
Protocol='sqs',
Endpoint=sqsQueueArn)
# Authorize SNS to write SQS queue
policy = """{{
"Version":"2012-10-17",
"Statement":[
{{
"Sid":"MyPolicy",
"Effect":"Allow",
"Principal" : {{"AWS" : "*"}},
"Action":"SQS:SendMessage",
"Resource": "{}",
"Condition":{{
"ArnEquals":{{
"aws:SourceArn": "{}"
}}
}}
}}
]
}}""".format(sqsQueueArn, self.snsTopicArn)
response = self.sqs.set_queue_attributes(
QueueUrl=self.sqsQueueUrl,
Attributes={
'Policy': policy
})
def DeleteTopicandQueue(self):
self.sqs.delete_queue(QueueUrl=self.sqsQueueUrl)
self.sns.delete_topic(TopicArn=self.snsTopicArn)
# Display information about a block
def DisplayBlockInfo(self, block):
print("Block Id: " + block['Id'])
print("Type: " + block['BlockType'])
if 'EntityTypes' in block:
print('EntityTypes: {}'.format(block['EntityTypes']))
if 'Text' in block:
print("Text: " + block['Text'])
        if block['BlockType'] != 'PAGE' and 'Confidence' in block:
print("Confidence: " + "{:.2f}".format(block['Confidence']) + "%")
print('Page: {}'.format(block['Page']))
if block['BlockType'] == 'CELL':
print('Cell Information')
print('\tColumn: {} '.format(block['ColumnIndex']))
print('\tRow: {}'.format(block['RowIndex']))
print('\tColumn span: {} '.format(block['ColumnSpan']))
print('\tRow span: {}'.format(block['RowSpan']))
if 'Relationships' in block:
print('\tRelationships: {}'.format(block['Relationships']))
if ("Geometry") in str(block):
print('Geometry')
print('\tBounding Box: {}'.format(block['Geometry']['BoundingBox']))
print('\tPolygon: {}'.format(block['Geometry']['Polygon']))
if block['BlockType'] == 'SELECTION_ELEMENT':
print(' Selection element detected: ', end='')
if block['SelectionStatus'] == 'SELECTED':
print('Selected')
else:
print('Not selected')
if block["BlockType"] == "QUERY":
print("Query info:")
print(block["Query"])
if block["BlockType"] == "QUERY_RESULT":
print("Query answer:")
print(block["Text"])
def GetResults(self, jobId):
maxResults = 1000
paginationToken = None
finished = False
while finished == False:
response = None
if self.processType == ProcessType.ANALYSIS:
if paginationToken == None:
response = self.textract.get_document_analysis(JobId=jobId,
MaxResults=maxResults)
else:
response = self.textract.get_document_analysis(JobId=jobId,
MaxResults=maxResults,
NextToken=paginationToken)
if self.processType == ProcessType.DETECTION:
if paginationToken == None:
response = self.textract.get_document_text_detection(JobId=jobId,
MaxResults=maxResults)
else:
response = self.textract.get_document_text_detection(JobId=jobId,
MaxResults=maxResults,
NextToken=paginationToken)
blocks = response['Blocks']
print('Detected Document Text')
print('Pages: {}'.format(response['DocumentMetadata']['Pages']))
# Display block information
for block in blocks:
self.DisplayBlockInfo(block)
print()
print()
if 'NextToken' in response:
paginationToken = response['NextToken']
else:
finished = True
def GetResultsDocumentAnalysis(self, jobId):
maxResults = 1000
paginationToken = None
finished = False
while finished == False:
response = None
if paginationToken == None:
response = self.textract.get_document_analysis(JobId=jobId,
MaxResults=maxResults)
else:
response = self.textract.get_document_analysis(JobId=jobId,
MaxResults=maxResults,
NextToken=paginationToken)
# Get the text blocks
blocks = response['Blocks']
print('Analyzed Document Text')
print('Pages: {}'.format(response['DocumentMetadata']['Pages']))
# Display block information
for block in blocks:
self.DisplayBlockInfo(block)
print()
print()
if 'NextToken' in response:
paginationToken = response['NextToken']
else:
finished = True
def main():
bucket = 'poyocr'
document = '018394d5-0dd80d.pdf'
region_name = 'us-west-1'
roleArn='an:as:iam::58:usr/tect'
analyzer = DocumentProcessor(roleArn, bucket, document, region_name)
analyzer.CreateTopicandQueue()
analyzer.ProcessDocument(ProcessType.ANALYSIS)
analyzer.DeleteTopicandQueue()
if __name__ == "__main__":
main()
Related
Running python.py shows the error below.
import boto3, datetime, sys, getopt, re
from operator import itemgetter
from table_logger import TableLogger
class buck:
    TARIFAWS = 0.023  # 0.023 USD per GB (first 50 TB per month)
counter = 0
def __init__(self, s3bucket):
buck.counter = buck.counter + 1
self.name = s3bucket.name
self.creationdate = s3bucket.creation_date
self.size = self.metricCloudwatch(s3bucket,"BucketSizeBytes", "StandardStorage")
self.nbreObj = self.metricCloudwatch(s3bucket,"NumberOfObjects", "AllStorageTypes")
try:
boto3.client('s3').get_bucket_encryption(Bucket=s3bucket.name)
self.number = True
except:
self.number = False
self.region = (boto3.client('s3').get_bucket_location(Bucket=s3bucket.name))['LocationConstraint']
self.cout = round(self.size / 1024**3 * self.TARIFAWS,2)
try:
boto3.client('s3').get_bucket_replication(Bucket=s3bucket.name)
self.replica = True
except:
self.replica = False
def collObjInfo(self):
s3obj = (boto3.client('s3')).list_objects_v2(Bucket=self.name)
self.lastupdate = None
self.typeStorage = None
if s3obj['KeyCount'] != 0:
self.lastupdate = s3obj['Contents'][0]['LastModified']
self.typeStorage = s3obj['Contents'][0]['StorageClass']
collObjInfo(self)
self.public = False
def __str__(self):
return str(self.__class__) + ": " + str(self.__dict__)
def __getitem__(self, key):
if key == 'region':
return self.region
if key == 'typeStorage':
return self.typeStorage
def getSize(self, human=False):
if human:
return humanReadable(self.size)
else:
return self.size
def metricCloudwatch(self, bucket, nameMetric, storage):
cloudwatch = boto3.client('cloudwatch')
now = datetime.datetime.now()
try:
cloudwatch_size = cloudwatch.get_metric_statistics(
Namespace='AWS/S3',
MetricName=nameMetric,
Dimensions=[
{'Name': 'BucketName', 'Value': bucket.name},
{'Name': 'StorageType', 'Value': storage}
],
Statistics=['Maximum'],
Period=86400,
StartTime=(now - datetime.timedelta(days=1)).isoformat(),
EndTime=now.isoformat()
)
if cloudwatch_size["Datapoints"]:
return cloudwatch_size["Datapoints"][0]['Maximum']
else:
return 0
except:
return 0
def humanReadable(num, suffix='B'):
for unit in ['','K','M','G','T','P']:
if abs(num) < 1024.0:
return "%3.1f%s%s" % (num, unit, suffix)
num /= 1024.0
return "%.1f%s%s" % (num, 'Yi', suffix)
def help():
print("Uso : bucket.py [OPTIONS]")
print("Exibe informações sobre buckets AWS S3, por padrão")
print("Argumentos : \n\
--help \t\t\t ajuda\n\
--crypted-only \t\t mostra apenas buckets criptografados\n\
-c, --csv \t\t mostra o resultado em CSV\n\
-s, --sorted \t\t agrupar resultados por região e grupo de armazenamento\n\
-h, --human-readable \t exibem tamanhos de 1024\n\
-f, --filter=filter \t filtra a lista de buckets com base na expressão regular FILTER")
def main():
csv=False
human = False
group = False
filterCrpt = False
filter = None
try:
        opts, args = getopt.getopt(sys.argv[1:], "shcf:", ["sorted", "help", "csv", "human-readable", "crypted-only", "filter="])
except:
print("Comando incorreto, aqui está a ajuda: ")
help()
sys.exit(2)
    for opt, arg in opts:
        if opt == "--help":
            help()
            sys.exit()
        elif opt == "--crypted-only":
            filterCrpt = True
        elif opt in ("-c", "--csv"):
            csv = True
        elif opt in ("-s", "--sorted"):
            group = True
        elif opt in ("-h", "--human-readable"):
            human = True
        elif opt in ("-f", "--filter"):
            if len(arg):
                filter = arg
else:
help()
sys.exit(2)
s3 = boto3.resource('s3')
bucks = []
listeS3Bucks = s3.buckets.all()
for bucket in listeS3Bucks:
try:
if filter:
re.match(filter,"Test chain")
except:
print("Regular expression error")
sys.exit(2)
if (filter and re.match(filter,bucket.name)) or not filter:
try:
bucks.append(buck(bucket))
except:
print("Erro ao conectar ao AWS, verifique suas configurações")
print("Para obter mais informações: https://docs.aws.amazon.com/cli/latest/userguide/cli-config-files.html")
sys.exit(2)
if group:
bucks = sorted(bucks, key=itemgetter('region'))
bucks = sorted(bucks, key=itemgetter('typeStorage'))
tbl = TableLogger(columns='name,creation date,last update,size,number of objects,number,storage,public,region,cost,replica',
csv=csv, border=False)
for cBuck in bucks:
if (filterCrpt and cBuck.number) or not filterCrpt:
tbl(cBuck.name, cBuck.creationdate, cBuck.lastupdate, cBuck.getSize(human), str(cBuck.nbreObj),
cBuck.number,cBuck.typeStorage, cBuck.public, cBuck.region, "$"+str(cBuck.cout),cBuck.replica)
if __name__ == "__main__":
main()
When trying to run the script in Python, it gives this error:
TypeError: unsupported format string passed to NoneType.__format__
Can someone help me? It is a script that pulls information from AWS S3 buckets.
Bucket is a Python script to extract statistics from S3 buckets. It is based on CloudWatch monitoring metrics.
Help, please.
File "C:\Python39\lib\site-packages\table_logger\table_logger.py", line 203, in __call__
line = self.format_row(*row_cells)
File "C:\Python39\lib\site-packages\table_logger\table_logger.py", line 207, in format_row
vals = [self.format_column(value, col) for col, value in enumerate(args)]
File "C:\Python39\lib\site-packages\table_logger\table_logger.py", line 207, in <listcomp>
vals = [self.format_column(value, col) for col, value in enumerate(args)]
File "C:\Python39\lib\site-packages\table_logger\table_logger.py", line 212, in format_column
return self.formatters[col](value)
File "C:\Python39\lib\site-packages\table_logger\fmt.py", line 40, in __call__
fmt = self.fmt(value)
TypeError: unsupported format string passed to NoneType.__format__
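The traceback shows table_logger calling __format__ on a None value. In this script, lastupdate and typeStorage stay None for empty buckets (KeyCount == 0), and region is also None for buckets in us-east-1, where get_bucket_location returns no LocationConstraint. One hedged workaround (a sketch, not the only possible fix) is to coalesce None fields to a placeholder string before handing each row to TableLogger:

def none_to_dash(value, placeholder='---'):
    # table_logger cannot format None, so substitute a plain string.
    return placeholder if value is None else value

# Sketch of the call site, reusing the tbl and cBuck names from the script above:
tbl(cBuck.name, cBuck.creationdate, none_to_dash(cBuck.lastupdate),
    cBuck.getSize(human), str(cBuck.nbreObj), cBuck.number,
    none_to_dash(cBuck.typeStorage), cBuck.public, none_to_dash(cBuck.region),
    "$" + str(cBuck.cout), cBuck.replica)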
When I updated the Python 2.7 version to Python 3.8, my AMI script stopped working properly; testing the code in the AWS console under the Lambda section shows a syntax mismatch between the 2.7 and 3.8 versions. Can anybody help me solve this issue?
Thanks
import boto3
import collections
import datetime
import time
import sys
ec2 = boto3.client('ec2')
sts = boto3.client('sts')
drec2 = boto3.client('ec2', region_name='eu-west-1')
dr = boto3.resource('ec2')
awsaccountid = sts.get_caller_identity().get('Account')
images = dr.images.filter(Owners=[awsaccountid])
date = datetime.datetime.now()
date_fmt = date.strftime('%Y-%m-%d')
time_fmt = date.strftime('%Y-%m-%d--%H-%M-%S')
def lambda_handler(event, context):
def get_instance_name(itags):
instancename = ''
if itags is None:
instancename = ''
else:
for tags in itags:
if tags["Key"] == 'Name':
instancename = tags["Value"]
return instancename
# Check for the availability of the image
def is_image_available(image_id):
try:
available = 0
while available == 0:
print("Not created yet.. Gonna sleep for 10 seconds")
time.sleep(10)
image = ec2.describe_images(ImageIds=[image_id])
if image['Images'][0]['State'] == 'available':
available = 1
if available == 1:
print("Image is now available for use.")
return True
except Exception,e:
print e
reservations = ec2.describe_instances(
Filters=[
{'Name': 'tag:Backup', 'Values': ['yes', 'Yes']},
]
).get(
'Reservations', []
)
instances = sum(
[
[i for i in r['Instances']]
for r in reservations
], [])
for instance in instances:
try:
retention_days = 2
except IndexError:
retention_days = 7
finally:
for image in images:
#today_time = datetime.datetime.now().strftime('%m-%d-%Y')
#today_fmt = today_time.strftime('%m-%d-%Y')
#today_date = time.strptime(today_time, '%m-%d-%Y')
aminame = get_instance_name(instance['Tags']) + "-" + instance['InstanceId'] + "-On-" + date_fmt
if image.name == aminame:
if is_image_available(image.id):
dramiid = drec2.copy_image(SourceRegion='us-east-1', SourceImageId=image.id,
Name="DR-" + get_instance_name(instance['Tags']) + "-" +
                                           instance['InstanceId'] + "-On-" + date_fmt,
Description="Lambda created AMI of instance " + instance[
'InstanceId'] + " On " + time_fmt)
print("Retaining DR AMI %s of instance %s for %d days" % (
dramiid['ImageId'],
instance['InstanceId'],
retention_days,
))
# to_tag[retention_days].append(dramiid['ImageId'])
# for retention_days in to_tag.keys():
delete_date = datetime.date.today() + datetime.timedelta(days=retention_days)
delete_fmt = delete_date.strftime('%m-%d-%Y')
print("Will delete the DR AMI in %d days on %s" % (retention_days, delete_fmt))
# break
drec2.create_tags(
Resources=[dramiid['ImageId']],
Tags=[
{'Key': 'DeleteOn', 'Value': delete_fmt},
]
)
It is giving the following error:
Function Logs:
START
RequestId: 1f9263c7-9736-4b31-934d-719d38f8fdba
Version: $LATEST
[ERROR] Runtime.UserCodeSyntaxError: Syntax error in module 'lambda_function': invalid syntax (lambda_function.py, line 52)
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 52
    except Exception,e:
END
RequestId: 1f9263c7-9736-4b31-934d-719d38f8fdba
Does anyone know how to solve this issue?
You need to make some syntax changes when running your script under Python 3.
For example:
in Python 3 you write `print('value')` with parentheses;
in Python 2 you could write `print "hello"`.
In Python 3 you write `except Exception as e`;
in Python 2 you could write `except Exception, e`.
Here is the corrected script:
import boto3
import collections
import datetime
import time
import sys
ec2 = boto3.client('ec2')
sts = boto3.client('sts')
drec2 = boto3.client('ec2', region_name='eu-west-1')
dr = boto3.resource('ec2')
awsaccountid = sts.get_caller_identity().get('Account')
images = dr.images.filter(Owners=[awsaccountid])
date = datetime.datetime.now()
date_fmt = date.strftime('%Y-%m-%d')
time_fmt = date.strftime('%Y-%m-%d--%H-%M-%S')
def lambda_handler(event, context):
def get_instance_name(itags):
instancename = ''
if itags is None:
instancename = ''
else:
for tags in itags:
if tags["Key"] == 'Name':
instancename = tags["Value"]
return instancename
# Check for the availability of the image
def is_image_available(image_id):
try:
available = 0
while available == 0:
print("Not created yet.. Gonna sleep for 10 seconds")
time.sleep(10)
image = ec2.describe_images(ImageIds=[image_id])
if image['Images'][0]['State'] == 'available':
available = 1
if available == 1:
print("Image is now available for use.")
return True
except Exception as e:
print(e)
reservations = ec2.describe_instances(
Filters=[
{'Name': 'tag:Backup', 'Values': ['yes', 'Yes']},
]
).get(
'Reservations', []
)
instances = sum(
[
[i for i in r['Instances']]
for r in reservations
], [])
for instance in instances:
try:
retention_days = 2
except IndexError:
retention_days = 7
finally:
for image in images:
#today_time = datetime.datetime.now().strftime('%m-%d-%Y')
#today_fmt = today_time.strftime('%m-%d-%Y')
#today_date = time.strptime(today_time, '%m-%d-%Y')
aminame = get_instance_name(instance['Tags']) + "-" + instance['InstanceId'] + "-On-" + date_fmt
if image.name == aminame:
if is_image_available(image.id):
dramiid = drec2.copy_image(SourceRegion='us-east-1', SourceImageId=image.id,
Name="DR-" + get_instance_name(instance['Tags']) + "-" +
                                           instance['InstanceId'] + "-On-" + date_fmt,
Description="Lambda created AMI of instance " + instance[
'InstanceId'] + " On " + time_fmt)
print("Retaining DR AMI %s of instance %s for %d days" % (
dramiid['ImageId'],
instance['InstanceId'],
retention_days,
))
# to_tag[retention_days].append(dramiid['ImageId'])
# for retention_days in to_tag.keys():
delete_date = datetime.date.today() + datetime.timedelta(days=retention_days)
delete_fmt = delete_date.strftime('%m-%d-%Y')
print("Will delete the DR AMI in %d days on %s" % (retention_days, delete_fmt))
# break
drec2.create_tags(
Resources=[dramiid['ImageId']],
Tags=[
{'Key': 'DeleteOn', 'Value': delete_fmt},
]
)
I'm trying to create a Scrapy app. When the app executes the 'parse' function it goes well (status=200), but when it calls 'parse_phones' it can't get that URL and logs the errors.
When I execute scrapy shell 'url that is in parse_phones' it runs without problems.
Can anyone tell me what the solution is?
import datetime
import random
import time

import scrapy

# Object, Item, Attr and Money are assumed to come from the project's
# Django models / django-money; their imports are not shown in the question.

class DeviceSpider(scrapy.Spider):
name = 'device'
start_urls = [
'https://www.gsmarena.com/makers.php3',
]
def parse(self, response):
for href in response.css('.st-text a::attr(href)'):
time.sleep(random.randint(30,50))
yield response.follow(href, self.parse_phones)
def parse_phones(self, response):
for href in response.css('#review-body a::attr(href)'):
time.sleep(random.randint(30,50))
yield response.follow(href, self.parse_device_info)
next_page = response.css('.pages-next::attr(href)').extract_first()
if next_page is not None:
time.sleep(random.randint(30,50))
            next_page = response.urljoin(next_page)
            yield response.follow(next_page, self.parse_phones)
def parse_device_info(self, response):
        price = response.css('td[data-spec=price]::text').get()
        models = response.css('td[data-spec=models]::text').get()
        name = response.css('.specs-phone-name-title::text').get()
Object.objects.create(name='werw')
if models:
models = models.split(', ')
else:
models = ['{}'.format(name)]
launch = response.css('td[data-spec=year]::text').get()
if launch:
launch = launch.split(', ')
            # assumes the year cell looks like "2019, February ...": year first, then month name
            obj = Object.objects.create(name=name, launch=datetime.date(int(launch[0]), datetime.datetime.strptime(launch[1][:3], '%b').month, 1))
else:
obj = Object.objects.create(name=name)
for m in models:
item = Item.objects.create(obj=obj, name=m)
if price:
                if 'euro' in price.lower():
                    Attr.objects.create(item=item, price=Money(float(price.split(' ')[1]), 'EUR'))
else:
price = price.split(' / ')
price = list(map(lambda x: x.replace('\u2009', ''),price))
                    for p in price:
price_currency = p[0]
price_amount = float(p.split('{}'.format(p[0]))[1])
bank_symbols_currency = ''
if price_currency == '$':
bank_symbols_currency = 'USD'
elif price_currency == '€':
bank_symbols_currency = 'EUR'
elif price_currency == '£':
bank_symbols_currency = 'GBP'
elif price_currency == '₹':
bank_symbols_currency = 'INR'
if bank_symbols_currency:
Attr.objects.create(item=item, price=Money(price_amount, bank_symbols_currency))
I tried setting a User-Agent but it doesn't change the situation.
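One thing worth checking: time.sleep() inside a Scrapy callback blocks the whole Twisted reactor rather than delaying a single request, so every in-flight request stalls. A hedged sketch (not a guaranteed fix for the errors above) that lets Scrapy itself space out requests via its built-in throttling settings:

import scrapy

class DeviceSpider(scrapy.Spider):
    name = 'device'
    start_urls = ['https://www.gsmarena.com/makers.php3']

    # Let Scrapy throttle requests instead of calling time.sleep(),
    # which blocks the reactor.
    custom_settings = {
        'DOWNLOAD_DELAY': 30,              # base delay between requests, in seconds
        'RANDOMIZE_DOWNLOAD_DELAY': True,  # jitter between 0.5x and 1.5x of the delay
        'CONCURRENT_REQUESTS_PER_DOMAIN': 1,
        'AUTOTHROTTLE_ENABLED': True,
    }

    def parse(self, response):
        for href in response.css('.st-text a::attr(href)'):
            yield response.follow(href, self.parse_phones)

With settings like these, the time.sleep(random.randint(30,50)) calls in the callbacks can be dropped.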
I am new to Python (2 months of programming/learning experience in total).
In this code all I do is get some data from an MSSQL database and transfer it to DynamoDB.
I just want to know why I am getting this error: ValueError: ctypes objects containing pointers cannot be pickled.
It is happening at this line: p.map(self.import_sample_data_dynamo, list_to_batch).
import_sample_data_dynamo is a function that batch-writes to DynamoDB.
list_to_batch is a list of dictionaries.
Can someone please tell me what I am doing wrong?
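For context on the error itself: Pool.map pickles everything it sends to the worker processes, and a bound method such as self.import_sample_data_dynamo drags the whole instance along with it, including the pypyodbc connection and cursor, which wrap ctypes pointers that cannot be pickled. A hedged sketch of one common workaround, moving the worker to module level so that only the plain dictionaries are pickled (the table name 'xxx' is kept as the placeholder from the script below):

import json
from decimal import Decimal

import boto3
from bson import json_util  # json_util is assumed to come from pymongo's bson package


def import_sample_data_dynamo(data):
    # Module-level worker: each process builds its own (unpicklable) clients.
    table = boto3.resource('dynamodb').Table('xxx')
    with table.batch_writer() as batch:
        for item in data:
            ddb_data = json.loads(json.dumps(item, default=json_util.default),
                                  parse_float=Decimal,
                                  object_hook=json_util.object_hook)
            batch.put_item(Item=ddb_data)
    return True

# Then, inside split_for_batch, map the module-level function instead of the method:
#     p.map(import_sample_data_dynamo, list_to_batch)  # no self -> nothing unpicklable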
import json
import multiprocessing
import time
from decimal import Decimal
from multiprocessing import Pool, freeze_support

import boto3
import pypyodbc
from bson import json_util
from pypyodbc import Cursor

class GetSensorsSamplesSetToDynamoDBTable:
def __init__(self):
self.client = None
self.db = None
# MSSQL
self.connection = None
self.cursor: Cursor = None
def init(self):
# MSSQL
connect_string = 'Driver={SQL Server};' \
'Server=xxxx;' 'Database=xxx;' \
'uid=xxx;pwd=xxx'
self.connection = pypyodbc.connect(connect_string)
self.cursor = self.connection.cursor()
# DynamoDB
dynamodb = boto3.resource('dynamodb')
self.table = dynamodb.Table('xxx')
def cleanup(self):
# MSSQL
self.cursor.close()
self.connection.close()
def do_work(self):
self.init()
data = []
samples = self.get_files_received_by_ftp_prod2_data()
for sample in samples:
sample_id = sample['id']
project_id = sample['projectid']
sensor_id = sample['sensorid']
sample_time = sample['sampletime']
row = {"_id": sample_id, 'ProjectID': project_id, 'SensorID': sensor_id,
'Sample_Time': sample_time,
'Z_Fields': sample}
data.append(row)
self.split_for_batch(data)
# self.import_sample_data_dynamo(data)
def get_files_received_by_ftp_prod2_data(self):
sql_cmd = f"SELECT TOP (1000) * FROM FilesReceivedByFTP_Prod2"
self.cursor.execute(sql_cmd)
records = self.cursor.fetchall()
samples = []
records = list(records)
for res in records:
samples_data = {self.cursor.description[i][0]: res[i] for i in range(len(res))}
self.fix_bad_fields(samples_data)
samples.append(samples_data)
return samples
def split_for_batch(self, data):
temp_list = []
list_to_batch = []
while len(data) > 0:
temp_list.append(data[0])
data.pop(0)
            if len(temp_list) > 24 or len(data) == 0:
list_to_batch.append(temp_list)
temp_list = []
print(len(data))
print(len(list_to_batch))
start_time = time.time()
num_workers = multiprocessing.cpu_count()
p = Pool(num_workers - 1)
p.map(self.import_sample_data_dynamo, list_to_batch)
p.close()
p.join()
elapsed = time.time() - start_time
print(f"read_all_samples elapsed {elapsed:.0F} Seconds")
def import_sample_data_dynamo(self, data):
with self.table.batch_writer() as batch:
for item in data:
ddb_data = json.loads(json.dumps(item, default=json_util.default),
parse_float=Decimal, object_hook=json_util.object_hook)
batch.put_item(Item=ddb_data)
return True
def fix_bad_fields(self, data):
for k, v in data.items():
if v == '':
data[k] = '---'
# elif type(v) == type(datetime.datetime.now()):
# # data[k] = v.strftime("%d/%m/%Y, %H:%M:%S")
# data[k] = v.timestamp()
elif type(v) is bytearray:
data[k] = "bytearray"
if __name__ == '__main__':
freeze_support()
worker = GetSensorsSamplesSetToDynamoDBTable()
worker.do_work()
I tried to write a script that will pull all the clients in my Meraki organization. An error occurred; I think the file that I want to iterate over can't be read by my code.
Thanks.
I tried changing the shard to a specific one and installing the modules I think are required, but I still get the same error.
import sys, getopt, requests, json, time, datetime, os, sqlite3
#SECTION: GLOBAL VARIABLES: MODIFY TO CHANGE SCRIPT BEHAVIOUR
API_EXEC_DELAY = 0.21 #Used in merakirequestthrottler() to avoid hitting dashboard API max request rate
#connect and read timeouts for the Requests module in seconds
REQUESTS_CONNECT_TIMEOUT = 90
REQUESTS_READ_TIMEOUT = 90
#SECTION: GLOBAL VARIABLES AND CLASSES: DO NOT MODIFY
LAST_MERAKI_REQUEST = datetime.datetime.now() #used by merakirequestthrottler()
ARG_APIKEY = '' #DO NOT STATICALLY SET YOUR API KEY HERE
ARG_ORGNAME = '' #DO NOT STATICALLY SET YOUR ORGANIZATION NAME HERE
ORG_LIST = None #list of organizations, networks and MRs the used API key has access to
DEVICE_DB = None #SQLite3 database of all network devices
MAX_CLIENT_TIMESPAN = 2592000 #maximum timespan GET clients Dashboard API call supports
class c_Net:
    def __init__(self):
        self.id = ''
        self.name = ''
        self.shard = 'n132.meraki.com'
        self.devices = []

class c_Organization:
    def __init__(self):
        self.id = ''
        self.name = ''
        self.shard = 'n132.meraki.com'
        self.nets = []
#SECTION: General use functions
def merakirequestthrottler():
#makes sure there is enough time between API requests to Dashboard not to hit shaper
global LAST_MERAKI_REQUEST
if (datetime.datetime.now()-LAST_MERAKI_REQUEST).total_seconds() < (API_EXEC_DELAY):
time.sleep(API_EXEC_DELAY)
LAST_MERAKI_REQUEST = datetime.datetime.now()
return
def printhelp():
    print(readMe)  # readMe is the help text defined at the top of the full script (not included in this paste)
#SECTION: Meraki Dashboard API communication functions
def getInventory(p_org):
    #returns the device inventory of an organization
merakirequestthrottler()
try:
r = requests.get('https://%s/api/v0/organizations/%s/inventory' % (p_org.shard, p_org.id), headers={'X-Cisco-Meraki-API-Key': ARG_APIKEY, 'Content-Type': 'application/json'}, timeout=(REQUESTS_CONNECT_TIMEOUT, REQUESTS_READ_TIMEOUT) )
except:
print('ERROR 01: Unable to contact Meraki cloud')
return(None)
if r.status_code != requests.codes.ok:
return(None)
return(r.json())
def getNetworks(p_org):
#returns a list of all networks in an organization
merakirequestthrottler()
try:
r = requests.get('https://%s/api/v0/organizations/%s/networks' % (p_org.shard, p_org.id), headers={'X-Cisco-Meraki-API-Key': ARG_APIKEY, 'Content-Type': 'application/json'}, timeout=(REQUESTS_CONNECT_TIMEOUT, REQUESTS_READ_TIMEOUT) )
except:
print('ERROR 07: Unable to contact Meraki cloud')
return(None)
if r.status_code != requests.codes.ok:
return(None)
return(r.json())
def getOrgs():
#returns the organizations' list for a specified admin, with filters applied
merakirequestthrottler()
try:
r = requests.get('https://n132.meraki.com/api/v0/organizations', headers={'X-Cisco-Meraki-API-Key': ARG_APIKEY, 'Content-Type': 'application/json'}, timeout=(REQUESTS_CONNECT_TIMEOUT, REQUESTS_READ_TIMEOUT) )
except:
print('ERROR 02: Unable to contact Meraki cloud')
return(None)
if r.status_code != requests.codes.ok:
return(None)
rjson = r.json()
orglist = []
listlen = -1
if ARG_ORGNAME.lower() == '/all':
for org in rjson:
orglist.append(c_Organization())
listlen += 1
orglist[listlen].id = org['id']
orglist[listlen].name = org['name']
else:
for org in rjson:
if org['name'] == ARG_ORGNAME:
orglist.append(c_Organization())
listlen += 1
orglist[listlen].id = org['id']
orglist[listlen].name = org['name']
return(orglist)
def getShardHost(p_org):
#Looks up shard URL for a specific org. Use this URL instead of 'api.meraki.com'
# when making API calls with API accounts that can access multiple orgs.
#On failure returns None
merakirequestthrottler()
try:
r = requests.get('https://n132.meraki.com/api/v0/organizations/%s/snmp' % p_org.id, headers={'X-Cisco-Meraki-API-Key': ARG_APIKEY, 'Content-Type': 'application/json'}, timeout=(REQUESTS_CONNECT_TIMEOUT, REQUESTS_READ_TIMEOUT) )
except:
print('ERROR 03: Unable to contact Meraki cloud')
return None
if r.status_code != requests.codes.ok:
return None
rjson = r.json()
return(rjson['hostname'])
def refreshOrgList():
global ORG_LIST
global DEVICE_DB
print('INFO: Starting org list refresh at %s...' % datetime.datetime.now())
flag_firstorg = True
orglist = getOrgs()
if not orglist is None:
for org in orglist:
print('INFO: Processing org "%s"' % org.name)
org.shard = 'n132.meraki.com'
orgshard = getShardHost(org)
if not orgshard is None:
org.shard = orgshard
netlist = getNetworks(org)
devlist = getInventory(org)
if not devlist is None and not netlist is None:
DEVICE_DB = sqlite3.connect(':memory:')
dbcursor = DEVICE_DB.cursor()
dbcursor.execute('''CREATE TABLE devices (serial text, name text, networkId text, mac text, type text, model text)''')
dbcursor.execute('''CREATE TABLE ouis (oui text)''')
DEVICE_DB.commit()
for device in devlist:
if not device['networkId'] is None:
devType = 'merakiDevice'
if device['model'][:2] in ['MR','MS','MX','Z1','Z3']:
devType = 'merakiNetworkDevice'
dbcursor.execute('''INSERT INTO devices VALUES (?,?,?,?,?,?)''', (device['serial'],device['name'],device['networkId'],device['mac'],devType,device['model']))
dbcursor.execute('''INSERT INTO ouis VALUES (?)''', (device['mac'][:8],))
DEVICE_DB.commit()
flag_firstnet = True
for net in netlist:
if net['type'] != 'systems manager': #ignore systems manager nets
dbcursor.execute('''SELECT serial, name, model FROM devices WHERE networkId = ? AND type = ?''', (net['id'],'merakiNetworkDevice'))
devicesofnet = dbcursor.fetchall()
if len(devicesofnet) > 0: #network has MR, MS, MX, Zx
if flag_firstnet:
if flag_firstorg:
ORG_LIST = []
lastorg = -1
flag_firstorg = False
ORG_LIST.append(org)
lastorg += 1
lastnet = -1
ORG_LIST[lastorg].nets = []
flag_firstnet = False
ORG_LIST[lastorg].nets.append(c_Net())
lastnet += 1
ORG_LIST[lastorg].nets[lastnet].id = net['id']
ORG_LIST[lastorg].nets[lastnet].name = net['name']
ORG_LIST[lastorg].nets[lastnet].shard = org.shard
ORG_LIST[lastorg].nets[lastnet].devices = []
for device in devicesofnet:
ORG_LIST[lastorg].nets[lastnet].devices.append(device)
LAST_ORGLIST_REFRESH = datetime.datetime.now()
print('INFO: Refresh complete at %s' % LAST_ORGLIST_REFRESH)
return None
def getclientlist(p_shardhost, p_serial, p_timespan):
merakirequestthrottler()
try:
r = requests.get('https://%s/api/v0/devices/%s/clients?timespan=%s' % (p_shardhost, p_serial, p_timespan), headers={'X-Cisco-Meraki-API-Key': ARG_APIKEY, 'Content-Type': 'application/json'}, timeout=(REQUESTS_CONNECT_TIMEOUT, REQUESTS_READ_TIMEOUT) )
except:
print('ERROR 04: Unable to contact Meraki cloud')
return(None)
if r.status_code != requests.codes.ok:
return(None)
return(r.json())
#SECTION: main
def main(argv):
global ARG_APIKEY
global ARG_ORGNAME
#initialize command line arguments
ARG_APIKEY = ''
ARG_ORGNAME = ''
arg_numresults = ''
arg_mode = ''
arg_filter = ''
#get command line arguments
try:
opts, args = getopt.getopt(argv, 'hk:o:m:')
except getopt.GetoptError:
printhelp()
sys.exit(2)
for opt, arg in opts:
if opt == '-h':
printhelp()
sys.exit()
elif opt == '-k':
ARG_APIKEY = arg
elif opt == '-o':
ARG_ORGNAME = arg
elif opt == '-m':
arg_mode = arg
#check that all mandatory arguments have been given
if ARG_APIKEY == '':
printhelp()
sys.exit(2)
#set defaults for empty command line arguments
if ARG_ORGNAME == '':
ARG_ORGNAME = '/all'
refreshOrgList()
if ORG_LIST is None or DEVICE_DB is None:
        print('ERROR 05: No organizations with network devices for the specified API key')
sys.exit(2)
DEVcursor = DEVICE_DB.cursor()
for org in ORG_LIST:
flag_firstNet = True
orgClientList = []
reportFileName = 'clients_' + org.name + '_' + str(datetime.datetime.now()).replace(':','.') + '.csv'
print ('INFO: Processing org "%s"' % org.name)
for net in org.nets:
print ('INFO: Processing net "%s"' % net.name)
for dev in net.devices:
clients = getclientlist(org.shard, dev[0], MAX_CLIENT_TIMESPAN)
for client in clients:
DEVcursor.execute('''SELECT oui FROM ouis WHERE oui = ?''', (client['mac'][:8],))
matchingMerakiOuis = DEVcursor.fetchall()
if len(matchingMerakiOuis) == 0: #client device is not, in fact, a Meraki device neighbour
if flag_firstNet:
flag_firstNet = False
print('INFO: Creating file "' + reportFileName + '"')
try:
f = open(reportFileName, 'w')
f.write('id,mac,description,mdnsName,dhcpHostname,ip,vlan,switchport,usageKBSentToClient,usageKBRecvFromClient,networkId,networkName,reportedByDevSerial,reportedByDevName,reportedByDevModel\n')
except:
print('ERROR 06: Unable to open file "' + reportFileName + '" for writing')
sys.exit(2)
try:
f.write(str(client['id']) + ',' +
str(client['mac']) + ',' +
str(client['description']) + ',' +
str(client['mdnsName']) + ',' +
str(client['dhcpHostname']) + ',' +
str(client['ip']) + ',' +
str(client['vlan']) + ',' +
str(client['switchport']) + ',' +
str(int(client['usage']['sent'])) + ',' +
str(int(client['usage']['recv'])) + ',' +
str(net.id) + ',' +
str(net.name) + ',' +
str(dev[0]) + ',' +
str(dev[1]) + ',' +
str(dev[2]) + '\n' )
except:
print('ERROR 08: Unable to write to file "' + reportFileName + '"')
sys.exit(2)
DEVICE_DB.close()
try:
f.close()
except:
print ('INFO: Unable to close file (not open?)')
if __name__ == '__main__':
main(sys.argv[1:])
Traceback error:
Traceback (most recent call last):
File "orgsclientcsv.py", line 351, in <module>
main(sys.argv[1:])
File "orgsclientcsv.py", line 308, in main
for client in clients:
TypeError: 'NoneType' object is not iterable
The result should be a CSV file that gives me a list of phone MAC addresses, serial numbers, and manufacturers, but the actual output is only the phone MAC addresses and descriptions like iPhone.
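On the traceback itself: getclientlist() returns None whenever the request fails or comes back with a non-OK status code, and the loop at line 308 then tries to iterate over that None. A hedged guard at the call site (a sketch reusing the names from the script above) avoids the crash and makes the failing device visible:

clients = getclientlist(org.shard, dev[0], MAX_CLIENT_TIMESPAN)
if clients is None:
    # Request failed or was rejected; skip this device instead of crashing.
    print('WARNING: no client list for device %s, skipping' % dev[0])
    continue
for client in clients:
    # ...existing per-client processing continues here...
    pass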