Downloading Reports in Amazon Advertising API

I got the report ID and, using it, could get the S3 download link. But when I try to use the link it shows "Access Unauthorized".

You need to do a GET request against the download link. The URL requires authentication as well, so the authorization header must be passed too.
Edit: I am adding a function I created that downloads the gzipped file and extracts the JSON in it, in case it helps anyone else:
import requests
import gzip
import json
import io

def report_download(url, headers):
    # GET the gzipped report from the download link (auth headers are required)
    req = requests.get(url, headers=headers)
    response = req.content
    # Decompress the gzip payload in memory
    zip_file = io.BytesIO(response)
    with gzip.open(zip_file, 'rb') as f:
        file_content = f.read()
    # Parse the JSON and write it to disk
    json_data = json.loads(file_content)
    with open("filename.json", "w") as outfile:
        json.dump(json_data, outfile)
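For reference, the headers passed above carry the same authentication fields used in the answer below; a minimal sketch, where the client ID, profile ID, and access token values are placeholders you would supply yourself:
headers = {
    "Amazon-Advertising-API-ClientId": client_id,   # LWA client ID (placeholder)
    "Amazon-Advertising-API-Scope": profile_id,     # advertising profile ID (placeholder)
    "Authorization": f"Bearer {access_token}",      # LWA access token (placeholder)
    "Content-Type": "application/json",
}
report_download(download_url, headers)  # download_url is the S3 link returned for the report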

Here's an easy way to download the report with Python and pandas:
import os
import requests

def api_download_report_resp(access_token, profile_id, report_id):
    url = f"https://advertising-api.amazon.com/v2/reports/{report_id}/download"
    client_id = os.getenv("AMAZON_ADS_API_CLIENT_ID")
    with requests.Session() as sess:
        sess.headers["Amazon-Advertising-API-ClientId"] = client_id
        sess.headers["Amazon-Advertising-API-Scope"] = profile_id
        sess.headers["Authorization"] = f"Bearer {access_token}"
        sess.headers["Content-Type"] = "application/json"
        resp = sess.get(url)
    return resp
resp = api_download_report_resp(access_token, profile_id, report_id)
print(resp)
# <Response [200]>
import pandas as pd
import io
fobj = io.BytesIO()
fobj.write(resp.content)
fobj.seek(0)
df = pd.read_json(fobj, compression='gzip')
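From there, persisting or inspecting the report is plain pandas usage; for example (the output filename is just a placeholder):
print(df.head())                                          # quick sanity check of the report rows
df.to_csv("sponsored_products_report.csv", index=False)   # placeholder filename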

Related

How to convert the URL response to a dataframe

I have been working on a requirement where I need to download a file from a website that outputs the data in CSV format and then write it to a SQL table. I am using the logic below to download the data from the website.
import urllib.request
import urllib.parse
from http.cookiejar import CookieJar
from bs4 import BeautifulSoup

# opener that keeps cookies between the login page and later requests
opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(CookieJar()))

url = "https://abcdef.oz/login"
response = opener.open(url)
if response.status == 200:
    #webUrl = urllib.request.urlopen(url)
    #print("Result code: " + str(webUrl.getcode()))
    data = response.read().decode("utf-8")
    soup = BeautifulSoup(data, "html.parser")
    # Got a token
    token = soup.find_all("meta", {"name": "token"}, limit=1)[0]['content']
    print("token", token)
    sdata = {"email": "abc@def.com", "password": "password"}
    data = urllib.parse.urlencode(sdata).encode()
    print("data", data)
    url = "https://abcdef.oz.co/reports"
    response = opener.open(url)
    r = response.read().decode('utf-8')
    print(r)
else:
    print("No Response")
How can the response now be converted to a format where I can skip the header and write the data to a SQL table?
The output of the response is as below
"Col1","Col2","Col3"\n"abc","def","efg"\n"hij","klm","mno"
Thanks in advance
This is not mind-blowing, but did you try:
df = pandas.read_html(url)
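Since the response body shown above is plain CSV text rather than an HTML table, another option is to parse the string directly and load it into SQL with pandas. A minimal sketch, assuming r holds the decoded response text; the connection string and table name are placeholders to replace:
import io
import pandas as pd
from sqlalchemy import create_engine

df = pd.read_csv(io.StringIO(r))   # the first line becomes the header, so only data rows remain
engine = create_engine("postgresql://user:password@host/dbname")  # placeholder connection string
df.to_sql("my_table", engine, if_exists="append", index=False)    # placeholder table name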

Convert Web URL to Image

I am trying to take a screenshot of a URL, but it captures the screenshot of the login gateway because of restricted entry. So I tried adding an ID and password to open the link, but it does not work for some reason. Could you help?
import requests
import urllib.parse
import getpass

BASE = 'https://mini.s-shot.ru/1024x0/JPEG/1024/Z100/?'  # size, format, and zoom can be modified as needed
url = 'https://mail.google.com/mail/'  # or whatever link you need
url = urllib.parse.quote_plus(url)
print(url)

Id = "XXXXXX"
key = getpass.getpass('Password :: ')  # password entered at the prompt
path = 'target1.jpg'

# note: the original snippet referenced an undefined name `Password`; `key` holds the entered password
response = requests.get(BASE + url + Id + key, stream=True)
if response.status_code == 200:
    with open(path, 'wb') as file:
        for chunk in response:
            file.write(chunk)
Thanks!

How to download and iterate over a CSV file

I'm trying to download and iterate over a CSV file, but I'm only reading the headers and no lines after them.
I tried using this answer but with no luck.
This is my code:
from datetime import datetime
import requests
import csv

def main():
    print("python main function")
    datetime_object = datetime.now().date()
    url = f'https://markets.cboe.com/us/equities/market_statistics/volume_reports/day/{datetime_object}/csv/?mkt=bzx'
    print(url)
    response = requests.get(url, stream=True)
    csv_content = response.content.decode('utf-8')
    print(csv_content)
    cr = csv.reader(csv_content.splitlines(), delimiter='~')
    my_list = list(cr)
    for row in my_list:
        print(row)

if __name__ == '__main__':
    main()
cr = csv.reader(csv_content.splitlines(), delimiter='~')
Change it to:
cr = csv.reader(csv_content.splitlines(), delimiter=',')
Also check whether you are downloading the full file or only the header by opening the URL in a browser ;)
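Putting the fix together, a minimal sketch of the corrected download-and-iterate loop (the comma delimiter matches the CSV the endpoint returns; taking the header off the top is optional if you only want the data lines):
import csv
import requests

response = requests.get(url)                               # url built as in the question
rows = csv.reader(response.text.splitlines(), delimiter=',')
header = next(rows, None)                                  # skip the header row
for row in rows:
    print(row)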

Get text to CSV format using Python

I am able to get the data from PDF to text.
But now I need to get the data in CSV format with the table structure.
I tried to get the table structure, but it didn't happen. Any inputs?
Also, I'm able to generate it through JSON.
Is there a way to get the result into a tabular CSV format?
Below is the code I have used.
import boto3
import time

# Document
s3BucketName = "textractanalysisexample"
documentName = "sheet_example.pdf"

def startJob(s3BucketName, objectName):
    response = None
    client = boto3.client('textract')
    response = client.start_document_text_detection(
        DocumentLocation={
            'S3Object': {
                'Bucket': s3BucketName,
                'Name': objectName
            }
        })
    return response["JobId"]

def isJobComplete(jobId):
    # For production use cases, use SNS based notification
    # Details at: https://docs.aws.amazon.com/textract/latest/dg/api-async.html
    time.sleep(5)
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    status = response["JobStatus"]
    #print("Job status: {}".format(status))
    while(status == "IN_PROGRESS"):
        time.sleep(5)
        response = client.get_document_text_detection(JobId=jobId)
        status = response["JobStatus"]
        #print("Job status: {}".format(status))
    return status

def getJobResults(jobId):
    pages = []
    client = boto3.client('textract')
    response = client.get_document_text_detection(JobId=jobId)
    pages.append(response)
    print("Resultset page received: {}".format(len(pages)))
    nextToken = None
    if('NextToken' in response):
        nextToken = response['NextToken']
    while(nextToken):
        response = client.get_document_text_detection(JobId=jobId, NextToken=nextToken)
        pages.append(response)
        #print("Resultset page received: {}".format(len(pages)))
        nextToken = None
        if('NextToken' in response):
            nextToken = response['NextToken']
    return pages

def lambda_handler(event, context):
    jobId = startJob(s3BucketName, documentName)
    #print("Started job with id: {}".format(jobId))
    if(isJobComplete(jobId)):
        response = getJobResults(jobId)
        # Print detected text
        for resultPage in response:
            for item in resultPage["Blocks"]:
                if item["BlockType"] == "LINE":
                    print(item["Text"])
You can use the csv module to write to a CSV file like so:
import csv

with open('my_pdf.txt', 'r') as in_file:
    stripped = (line.strip() for line in in_file)
    lines = (line.split(",") for line in stripped if line)
    with open('my_pdf.csv', 'w') as out_file:
        writer = csv.writer(out_file)
        writer.writerow(('title', 'intro'))
        writer.writerows(lines)
You can just put in the rows you need, and this splits your data into comma-separated values. You can see more information on the CSV writer (and the csv module in general) in the Python docs.
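Since the question is specifically about preserving table structure, note that Textract also offers a table-aware analysis API alongside plain text detection. A minimal sketch, assuming the same bucket and document names as above; reconstructing cells into rows still requires walking the TABLE/CELL block relationships, which is omitted here:
import boto3

client = boto3.client('textract')
response = client.start_document_analysis(
    DocumentLocation={'S3Object': {'Bucket': s3BucketName, 'Name': documentName}},
    FeatureTypes=['TABLES']   # ask Textract for table blocks, not just lines of text
)
jobId = response['JobId']
# poll with client.get_document_analysis(JobId=jobId), as in isJobComplete/getJobResults above;
# the result pages then contain TABLE and CELL blocks in addition to LINE blocks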

Specify outpath when downloading files from URL

I have some files I am downloading from a URL.
I can currently access my files like this:
import requests
from bs4 import BeautifulSoup
import os
import shutil

prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    result = s.get(final_link, stream=True)
    with open(a['href'], 'wb') as out_file:
        shutil.copyfileobj(result.raw, out_file)
This downloads the files fine and puts them into the default directory of C:/User.
I would like to choose where my files are downloaded, though. You can choose the outpath with wget, but my attempt at that downloads empty files, as if they aren't being accessed.
I tried it with wget like this:
out_path = "C:/my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in page.find_all('a', href=True):
final_link = os.path.join(prefix, a['href'])
download = wget.download(final_link, out = out_path)
I think wget isn't working because I am accessing the website with authentication (not shown), and when I join the final link I am no longer accessing it with authentication. Is there a way to specify the outpath with shutil?
What about using the first method, replacing the path of the file opened with os.path.join(out_path, a['href']) ?
import requests
from bs4 import BeautifulSoup
import os
import shutil

out_path = "C:\\my_path"
prefix = 'https://n5eil01u.ecs.nsidc.org/MOST/MOD10A1.006/'
download_url = "https:/path_to_website"
s = requests.session()
soup = BeautifulSoup(s.get(download_url).text, "lxml")
for a in soup.find_all('a', href=True):
    final_link = os.path.join(prefix, a['href'])
    result = s.get(final_link, stream=True)
    new_file_path = os.path.join(out_path, a['href'])
    with open(new_file_path, 'wb') as out_file:  # this will create the new file at new_file_path
        shutil.copyfileobj(result.raw, out_file)
You can create the target path like below:
target_path = r'c:\windows\temp'
with open(os.path.join(target_path, a['href']), 'wb') as out_file:
    shutil.copyfileobj(result.raw, out_file)
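One extra detail when pointing the download at an arbitrary outpath: the directory has to exist before open() can create a file there. A minimal sketch of the same write step with the target directory created up front (out_path is a placeholder; s, final_link, and a come from the loop above):
import os
import shutil

out_path = "C:/my_path"
os.makedirs(out_path, exist_ok=True)        # create the target directory if it is missing

result = s.get(final_link, stream=True)
target_file = os.path.join(out_path, a['href'])
with open(target_file, 'wb') as out_file:
    shutil.copyfileobj(result.raw, out_file)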
