Downloading large files (10GB+) from S3 fails when the SAML token expires (Python/Athena) - python-3.x

I am still learning Python (3.6) and now working on AWS. I am trying to automate a process wherein the user runs a query in Athena. The results of the query are directed to an S3 bucket. From S3, I need to pull the file down to my local machine and then run some more analysis using legacy tools. All of this is currently done manually, step by step, starting with firing a query in the Athena Query Editor.
The problem I am facing is that the file(s) will be larger than 10GB and the SAML profile token expires after 1 hour. I have read some documentation about auto-refreshing the credentials; however, while the file is being downloaded, how would I even implement a solution like that? I have put my code below (that's the closest I got to a successful run, with about 10,000 records).
Any suggestions/help is appreciated.
import boto3
from boto3.s3.transfer import TransferConfig
import pandas as pd
import time

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

session = boto3.Session(profile_name='saml')
athena_client = session.client("athena")

# Kick off the Athena query; the result CSV lands in the S3 output location.
query_response = athena_client.start_query_execution(
    QueryString="SELECT * FROM TABLENAME WHERE=<condition>",
    QueryExecutionContext={"Database": 'some_db'},
    ResultConfiguration={
        "OutputLocation": 's3://131653427868-heor-epi-workbench-results',
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
    WorkGroup='myworkgroup'
)
print(query_response)

iteration = 30
temp_file_location: str = "C:\\Users\\<user>\\Desktop\\Python Projects\\tablename.csv"

# Poll the query status up to 30 times, 10 seconds apart, then download the result.
while iteration > 0:
    iteration = iteration - 1
    print(iteration)
    query_response_id = athena_client.get_query_execution(QueryExecutionId=query_response['QueryExecutionId'])
    print(query_response_id)
    if (query_response_id['QueryExecution']['Status']['State'] == 'FAILED') or (query_response_id['QueryExecution']['Status']['State'] == 'CANCELLED'):
        print("IF BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print("The Query Failed.")
    elif (query_response_id['QueryExecution']['Status']['State'] == 'SUCCEEDED'):
        print("ELSE IF BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print("Query Completed. Ready to download.")
        print("Proceeding to Download File......")
        config = TransferConfig(max_concurrency=5)
        s3_client = session.client("s3")
        s3_client.download_file('131653427868-heor-epi-workbench-results',
                                f"{query_response['QueryExecutionId']}.csv",
                                temp_file_location,
                                Config=config)
        print("Download complete. Setting Iteration to 0 to exit loop. ")
        iteration = 0
    else:
        print("ELSE BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print(query_response_id['QueryExecution']['Status']['State'])
        time.sleep(10)

pandasDF = pd.read_csv(temp_file_location)
print(pandasDF)
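One way to approach the auto-refresh that the question mentions is to build the boto3 session on top of botocore's RefreshableCredentials, so botocore re-fetches credentials shortly before the token expires, even in the middle of a long multipart download. The following is only a minimal sketch: fetch_saml_credentials() is a hypothetical placeholder for whatever re-runs your SAML flow (or re-reads the profile your SAML tool refreshes), and assigning to the private _credentials attribute is a common but unofficial pattern.

import boto3
from botocore.credentials import RefreshableCredentials
from botocore.session import get_session


def _refresh():
    # Hypothetical helper: re-run the SAML login (or re-read the refreshed
    # profile) and return new temporary keys in the shape botocore expects.
    creds = fetch_saml_credentials()  # placeholder, not a real boto3/botocore API
    return {
        "access_key": creds["AccessKeyId"],
        "secret_key": creds["SecretAccessKey"],
        "token": creds["SessionToken"],
        # ISO 8601 expiry; botocore refreshes shortly before this time
        "expiry_time": creds["Expiration"],
    }


def refreshable_session():
    credentials = RefreshableCredentials.create_from_metadata(
        metadata=_refresh(),
        refresh_using=_refresh,
        method="saml-assertion",
    )
    botocore_session = get_session()
    botocore_session._credentials = credentials  # unofficial, but a widely used pattern
    return boto3.Session(botocore_session=botocore_session)


# Use this session for the S3 client so a long download survives token expiry:
# s3_client = refreshable_session().client("s3")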

Related

Run databricks job from notebook

I want to know if it is possible to run a Databricks job from a notebook using code, and how to do it.
I have a job with multiple tasks and many contributors, and we have a job created to execute it all. Now we want to run the job from a notebook, to test new features without creating a new task in the job, and also to run the job multiple times in a loop, for example:
for i in [1, 2, 3]:
    run job with parameter i
Regards
What you need to do is the following:
Install the databricksapi package: %pip install databricksapi==1.8.1
Create your job and return an output. You can do that by exiting the notebook like this:
import json
dbutils.notebook.exit(json.dumps({"result": f"{_result}"}))
If you want to pass a DataFrame, you have to pass it as a JSON dump too; there is some official documentation about that from Databricks, check it out.
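A minimal sketch of the DataFrame case, assuming a small Spark DataFrame with column names invented for illustration:

import json

# Sketch only: serialize a small result DataFrame so the calling notebook
# can rebuild it from the job's notebook_output. Only sensible for small results.
result_df = spark.createDataFrame([(1, "a"), (2, "b")], ["id", "value"])
dbutils.notebook.exit(json.dumps({"result": [row.asDict() for row in result_df.collect()]}))

# Caller side, after runsGetOutput(...):
#   rows = json.loads(note_output)["result"]
#   df_back = spark.createDataFrame(rows)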
Get the job ID; you will need it later. You can get it from the job's details page in Databricks.
In the executor's notebook you can use the following code.
# Imports inferred from the snippet: json/time from the standard library and
# Jobs from the databricksapi package installed above.
import json
import time
from databricksapi import Jobs


def run_ks_job_and_return_output(params):
    context = json.loads(dbutils.notebook.entry_point.getDbutils().notebook().getContext().toJson())
    # context
    url = context['extraContext']['api_url']
    token = context['extraContext']['api_token']

    jobs_instance = Jobs.Jobs(url, token)  # initialize a jobs_instance
    runs_job_id = jobs_instance.runJob(****************, 'notebook',
                                       params)  # **** is the job id

    run_is_not_completed = True
    while run_is_not_completed:
        current_run = [run for run in jobs_instance.runsList('completed')['runs'] if run['run_id'] == runs_job_id['run_id'] and run['number_in_job'] == runs_job_id['number_in_job']]
        if len(current_run) == 0:
            time.sleep(30)
        else:
            run_is_not_completed = False
            current_run = current_run[0]
            print(f"Result state: {current_run['state']['result_state']}, You can check the resulted output in the following link: {current_run['run_page_url']}")
            note_output = jobs_instance.runsGetOutput(runs_job_id['run_id'])['notebook_output']
            return note_output


run_ks_job_and_return_output({'parm1': 'george',
                              'variable': "values1"})
If you want to run the job many times in parallel, you can do the following. (First, be sure that you have increased the maximum concurrent runs in the job settings.)
from multiprocessing.pool import ThreadPool

pool = ThreadPool(1000)
results = pool.map(lambda j: run_ks_job_and_return_output({'table': 'george',
                                                           'variable': "values1",
                                                           'j': j}),
                   [str(x) for x in range(2, len(snapshots_list))])
There is also the possibility of saving the whole HTML output, but maybe you are not interested in that. In any case, I will answer that in another post on Stack Overflow.
Hope it helps.
You can use the following steps:
Note-01:
dbutils.widgets.text("foo", "fooDefault", "fooEmptyLabel")
dbutils.widgets.text("foo2", "foo2Default", "foo2EmptyLabel")
result = dbutils.widgets.get("foo")+"-"+dbutils.widgets.get("foo2")
def display():
print("Function Display: "+result)
dbutils.notebook.exit(result)
Note-02:
thislist = ["apple", "banana", "cherry"]
for x in thislist:
dbutils.notebook.run("Note-01 path", 60, {"foo": x,"foo2":'Azure'})

How to automate an API to change the URL every hour and append the new data to a csv

I have successfully implemented an API call that generates a unique URL to grab data from a database and downloads it into a CSV. I am now attempting to automate this so that it generates the unique URL every hour and then appends the new data to the CSV file. I have no idea where to begin with the automation, but the working code is pasted below, so any help would be truly appreciated. Thank you.
import os
import sys
from datetime import datetime
from os.path import expanduser
import urllib.request


def main():
    # API parameters
    options = {}
    options["url"] = "https://airnowapi.org/aq/data/"
    options["start_date"] = "2020-01-01"
    options["start_hour_utc"] = "01"
    options["end_date"] = "2020-01-01"
    options["end_hour_utc"] = "05"
    options["parameters"] = "pm25"
    options["bbox"] = "-76,38,-72,42"
    options["data_type"] = "b"
    options["format"] = "text/csv"
    options["ext"] = "csv"
    options["api_key"] = ""  # Not included for protection of unique API key

    # API request URL
    REQUEST_URL = options["url"] \
        + "?startdate=" + options["start_date"] \
        + "t" + options["start_hour_utc"] \
        + "&enddate=" + options["end_date"] \
        + "t" + options["end_hour_utc"] \
        + "&parameters=" + options["parameters"] \
        + "&bbox=" + options["bbox"] \
        + "&datatype=" + options["data_type"] \
        + "&format=" + options["format"] \
        + "&api_key=" + options["api_key"]

    try:
        # Request AirNowAPI data
        print("Requesting AirNowAPI data...")
        print(REQUEST_URL)

        # User's home directory.
        home_dir = expanduser(r"E:\SPRING2021\AIRNOWAPI\AIRNOWFILES")
        download_file_name = "AirNowAPI" + datetime.now().strftime("_%Y%m%d%H%M%S." + options["ext"])  # %m = month
        download_file = os.path.join(home_dir, download_file_name)

        # Perform the AirNow API data request
        api_data = urllib.request.URLopener()
        api_data.retrieve(REQUEST_URL, download_file)

        # Download complete
        print("Download URL: %s" % REQUEST_URL)
        print("Download File: %s" % download_file)
    except Exception as e:
        print("Unable to perform AirNowAPI request. %s" % e)
        sys.exit(1)


if __name__ == "__main__":
    main()
I find most of your code is well documented. There are many ways to automate your task. Here are the steps I would recommend:
Create a config file.
Try to separate your code from the config (all the options data). You can even pickle it.
Make your code executable from the command line, like python main.py config.yml, where you pass the config file.
Checkpoint: here your code should be in one or more files and the config in another file.
Use a cron job or any scheduler to trigger step 3.
Shared variables/data: if you have variables that need to be passed from one execution to the next, you can use a static file where you dump this data and read it back on the next iteration.
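As an illustration of steps 1-3, here is a minimal sketch. The config keys mirror the options dictionary in the question; the output_csv key, the config.yml file name, and the use of PyYAML are assumptions for the example. The hourly window is derived from the current UTC time so a scheduler (e.g. a cron entry like 0 * * * * python main.py config.yml) can run it unattended and append each pull to one CSV.

import sys
import urllib.request
from datetime import datetime, timedelta

import yaml  # assumption: pip install pyyaml


def build_url(opts):
    # Query the previous full hour so each hourly run pulls the newest data.
    now = datetime.utcnow()
    start = now - timedelta(hours=1)
    return (
        f"{opts['url']}?startdate={start:%Y-%m-%d}t{start:%H}"
        f"&enddate={now:%Y-%m-%d}t{now:%H}"
        f"&parameters={opts['parameters']}&bbox={opts['bbox']}"
        f"&datatype={opts['data_type']}&format={opts['format']}"
        f"&api_key={opts['api_key']}"
    )


def main(config_path):
    with open(config_path) as f:
        opts = yaml.safe_load(f)  # same keys as the options dict above, plus output_csv
    with urllib.request.urlopen(build_url(opts)) as resp, \
            open(opts["output_csv"], "ab") as out:
        out.write(resp.read())  # append this hour's rows to the running CSV


if __name__ == "__main__":
    main(sys.argv[1])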

How to get the list of followers from an Instagram account without getting banned?

I am trying to scrape all the followers of some particular Instagram accounts. I am using Python 3.8.3 and the latest version of Instaloader library. The code I have written is given below:
# Import the required libraries:
import instaloader
import time
from random import randint

# Start time:
start = time.time()

# Create an instance of instaloader:
loader = instaloader.Instaloader()

# Credentials & target account:
user_id = USERID
password = PASSWORD
target = TARGET  # Account of which the list of followers need to be scraped;

# Login or load the session:
loader.login(user_id, password)

# Obtain the profile metadata of the target:
profile = instaloader.Profile.from_username(loader.context, target)

# Print the list of followers and save it in a text file:
try:
    # The list to store the collected user handles of the followers:
    followers_list = []

    # Variables used to apply pauses to slow down scraping:
    count = 0
    short_counter = 1
    short_pauser = randint(19, 24)
    long_counter = 1
    long_pauser = randint(4900, 5000)

    # Fetch the followers one by one:
    for follower in profile.get_followers():
        sleeper = randint(840, 1020)

        # Short pause for the process:
        if (short_counter % short_pauser == 0):
            short_counter = 0
            short_pauser = randint(19, 24)
            print('\nShort Pause.\n')
            time.sleep(1)

        # Long pause for the process:
        if (long_counter % long_pauser == 0):
            long_counter = 0
            long_pauser = randint(4900, 5000)
            print('\nLong pause.\n')
            time.sleep(sleeper)

        # Append the list and print the follower's user handle:
        followers_list.append(follower.username)
        print(count, '', followers_list[count])

        # Increment the counters accordingly:
        count = count + 1
        short_counter = short_counter + 1
        long_counter = long_counter + 1

    # Store the followers list in a txt file:
    txt_file = target + '.txt'
    with open(txt_file, 'a+') as f:
        for the_follower in followers_list:
            f.write(the_follower)
            f.write('\n')
except Exception as e:
    print(e)

# End time:
end = time.time()
total_time = end - start

# Print the time taken for execution:
print('Time taken for complete execution:', total_time, 's.')
I am getting the following error after scraping some data:
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
HTTP Error 400 (Bad Request) on GraphQL Query. Retrying with shorter page length.
400 Bad Request
In fact, the error occurs when Instagram detects unusual activity and disables the account for a while, prompting the user to change the password.
I have tried -
(1) Slowing down the process of scraping.
(2) Adding pauses in between in order to make the program more human-like.
Still, no progress.
How can I bypass such restrictions and get the complete list of all the followers?
If getting the entire list is not possible, what is the best way to get a list of at least 20,000 followers (from multiple accounts) without getting banned, having the account disabled, or facing similar inconveniences?

How to avoid header while exporting BigQuery table into Google Storage

I have developed the code below, which exports a BigQuery table to a Google Storage bucket. I want to merge the files into a single file without a header, so that downstream processes can use the file without any issue.
def export_bq_table_to_gcs(self, table_name):
    client = bigquery.Client(project=project_name)
    print("Exporting table {}".format(table_name))
    dataset_ref = client.dataset(dataset_name,
                                 project=project_name)
    dataset = bigquery.Dataset(dataset_ref)
    table_ref = dataset.table(table_name)
    size_bytes = client.get_table(table_ref).num_bytes

    # For tables bigger than 1GB uses Google auto split, otherwise export is forced in a single file.
    if size_bytes > 10 ** 9:
        destination_uris = [
            'gs://{}/{}{}*.csv'.format(bucket_name,
                                       f'{table_name}_temp', uid)]
    else:
        destination_uris = [
            'gs://{}/{}{}.csv'.format(bucket_name,
                                      f'{table_name}_temp', uid)]

    extract_job = client.extract_table(table_ref, destination_uris)  # API request
    result = extract_job.result()  # Waits for job to complete.

    if result.state != 'DONE' or result.errors:
        raise Exception('Failed extract job {} for table {}'.format(result.job_id, table_name))
    else:
        print('BQ table(s) export completed successfully')
        storage_client = storage.Client(project=gs_project_name)
        bucket = storage_client.get_bucket(gs_bucket_name)
        blob_list = bucket.list_blobs(prefix=f'{table_name}_temp')
        print('Merging shard files into single file')
        bucket.blob(f'{table_name}.csv').compose(blob_list)
Can you please help me find a way to skip the header?
Thanks,
Raghunath.
We can avoid the header by using the job config to set the print_header parameter to False. Sample code:
job_config = bigquery.job.ExtractJobConfig(print_header=False)
extract_job = client.extract_table(table_ref, destination_uris,
                                   job_config=job_config)
Thanks
You can use skipLeadingRows (https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#externalDataConfiguration.googleSheetsOptions.skipLeadingRows)
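If the downstream step happens to load the CSV back into BigQuery, the header can also be skipped on the read side rather than at export time. A minimal sketch with the Python client follows; the bucket, file, and table names are placeholders invented for the example.

from google.cloud import bigquery

client = bigquery.Client()

job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    skip_leading_rows=1,  # ignore the header row of each CSV file
    autodetect=True,
)

load_job = client.load_table_from_uri(
    "gs://your-bucket/tablename.csv",        # placeholder URI
    "your-project.your_dataset.your_table",  # placeholder table
    job_config=job_config,
)
load_job.result()  # wait for the load to complete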

Selenium (Python) - waiting for a download process to complete using Chrome web driver

I'm using Selenium and Python via ChromeDriver (Windows) in order to automate the task of downloading a large number of files from different pages.
My code works, but the solution is far from ideal: the function below clicks the website button that initiates a JavaScript function which generates a PDF file and then downloads it.
I had to use a static wait in order to wait for the download to complete (ugly). I cannot check the file system to verify when the download is completed, since I'm using multithreading (downloading lots of files from different pages at once) and the names of the files are generated dynamically by the website itself.
My code:
def file_download(num, drivervar):
    Counter += 1
    try:
        drivervar.get(url[num])
        download_button = WebDriverWait(drivervar, 20).until(EC.element_to_be_clickable((By.ID, 'download button ID')))
        download_button.click()
        time.sleep(10)
    except TimeoutException:  # Retry once
        print('Timeout in thread number: ' + str(num) + ', retrying...')
        .....
Is it possible to determine download completion in webdriver? I want to avoid using time.sleep(x).
Thanks a lot.
You can get the status of each download by visiting chrome://downloads/ with the driver.
To wait for all the downloads to finish and to list all the paths:
def every_downloads_chrome(driver):
    if not driver.current_url.startswith("chrome://downloads"):
        driver.get("chrome://downloads/")
    return driver.execute_script("""
        var items = document.querySelector('downloads-manager')
            .shadowRoot.getElementById('downloadsList').items;
        if (items.every(e => e.state === "COMPLETE"))
            return items.map(e => e.fileUrl || e.file_url);
        """)


# waits for all the files to be completed and returns the paths
paths = WebDriverWait(driver, 120, 1).until(every_downloads_chrome)
print(paths)
This was updated to support changes up to Chrome version 81.
I have had the same problem and found a solution. You can check whether a .crdownload file is in your download folder. If there are no files with a .crdownload extension in the download folder, then all your downloads are completed. This only works for Chrome and Chromium, I think.
def downloads_done():
    for filename in os.listdir("/downloads"):
        if ".crdownload" in filename:
            time.sleep(0.5)
            downloads_done()  # recurse until no partial downloads remain
            break
Whenever you call downloads_done(), it will loop (recurse) until all downloads are completed. If you are downloading massive files, like 80 gigabytes, then I don't recommend this, because the function can reach the maximum recursion depth.
2020 edit:
def wait_for_downloads():
    print("Waiting for downloads", end="")
    while any([filename.endswith(".crdownload") for filename in
               os.listdir("/downloads")]):
        time.sleep(2)
        print(".", end="")
    print("done!")
The "end" keyword argument in print() usually holds a newline but we replace it.
While there are no filenames in the /downloads folder that end with .crdownload
sleep for 2 seconds and print one dot without newline to console
I don't really recommend using Selenium anymore after finding out about requests, but if it's a very heavily guarded site with Cloudflare, captchas, etc., then you might have to resort to Selenium.
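For context, downloading directly with requests looks roughly like the sketch below; the URL and file name are placeholders, and this only works when the file is reachable without the browser session.

import requests

# Sketch: stream a file straight to disk without driving a browser.
url = "https://example.com/report.pdf"  # placeholder URL
with requests.get(url, stream=True, timeout=60) as resp:
    resp.raise_for_status()
    with open("report.pdf", "wb") as f:  # placeholder file name
        for chunk in resp.iter_content(chunk_size=1 << 20):
            f.write(chunk)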
With Chrome 80, I had to change the answer from @florent-b to the code below:
def every_downloads_chrome(driver):
    if not driver.current_url.startswith("chrome://downloads"):
        driver.get("chrome://downloads/")
    return driver.execute_script("""
        return document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items.filter(e => e.state === 'COMPLETE')
            .map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
        """)
I believe this is backward-compatible; that is, it should also work with older versions of Chrome.
There are issues with opening chrome://downloads/ when running Chrome in headless mode.
The following function uses a composite approach that works whether the mode is headless or not, choosing the better approach available in each mode.
It assumes that the caller clears all files downloaded at file_download_path after each call to this function.
import os
import time
import logging
from selenium.webdriver.support.ui import WebDriverWait


def wait_for_downloads(driver, file_download_path, headless=False, num_files=1):
    max_delay = 60
    interval_delay = 0.5
    if headless:
        total_delay = 0
        done = False
        while not done and total_delay < max_delay:
            files = os.listdir(file_download_path)
            # Remove system files if present: Mac adds the .DS_Store file
            if '.DS_Store' in files:
                files.remove('.DS_Store')
            if len(files) == num_files and not [f for f in files if f.endswith('.crdownload')]:
                done = True
            else:
                total_delay += interval_delay
                time.sleep(interval_delay)
        if not done:
            logging.error("File(s) couldn't be downloaded")
    else:
        def all_downloads_completed(driver, num_files):
            return driver.execute_script("""
                var items = document.querySelector('downloads-manager').shadowRoot.querySelector('#downloadsList').items;
                var i;
                var done = false;
                var count = 0;
                for (i = 0; i < items.length; i++) {
                    if (items[i].state === 'COMPLETE') {count++;}
                }
                if (count === %d) {done = true;}
                return done;
                """ % (num_files))

        driver.execute_script("window.open();")
        driver.switch_to_window(driver.window_handles[1])
        driver.get('chrome://downloads/')

        # Wait for downloads to complete
        WebDriverWait(driver, max_delay, interval_delay).until(lambda d: all_downloads_completed(d, num_files))

        # Clear all downloads from chrome://downloads/
        driver.execute_script("""
            document.querySelector('downloads-manager').shadowRoot
                .querySelector('#toolbar').shadowRoot
                .querySelector('#moreActionsMenu')
                .querySelector('button.clear-all').click()
            """)
        driver.close()
        driver.switch_to_window(driver.window_handles[0])
import os
import time
import unittest

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait


class MySeleniumTests(unittest.TestCase):
    selenium = None

    @classmethod
    def setUpClass(cls):
        cls.selenium = webdriver.Firefox(...)
        ...

    def test_download(self):
        os.chdir(self.download_path)  # default download directory
        # click the button
        self.selenium.get(...)
        self.selenium.find_element_by_xpath(...).click()

        # waiting for the server to finish its inner task
        def download_begin(driver):
            if len(os.listdir()) == 0:
                time.sleep(0.5)
                return False
            else:
                return True
        WebDriverWait(self.selenium, 120).until(download_begin)  # the max waiting time is 120s

        # waiting for the server to finish sending.
        # if the size of the directory is changing, wait
        def download_complete(driver):
            sum_before = -1
            sum_after = sum([os.stat(file).st_size for file in os.listdir()])
            while sum_before != sum_after:
                time.sleep(0.2)
                sum_before = sum_after
                sum_after = sum([os.stat(file).st_size for file in os.listdir()])
            return True
        WebDriverWait(self.selenium, 120).until(download_complete)  # the max waiting time is 120s
You must do these things:
Wait for the server to finish its inner business (for example, a query against the database).
Wait for the server to finish sending the files.
(My English is not very good.)
To obtain the return of more than one item, I had to change the answer of @thdox to the code below:
def every_downloads_chrome(driver):
    if not driver.current_url.startswith("chrome://downloads"):
        driver.get("chrome://downloads/")
    return driver.execute_script("""
        var elements = document.querySelector('downloads-manager')
            .shadowRoot.querySelector('#downloadsList')
            .items
        if (elements.every(e => e.state === 'COMPLETE'))
            return elements.map(e => e.filePath || e.file_path || e.fileUrl || e.file_url);
        """)
This may not work for all use cases, but for my simple need to wait for one PDF to download it works great. Based off of Walter's comment above.
def get_non_temp_len(download_dir):
    non_temp_files = [i for i in os.listdir(download_dir) if not (i.endswith('.tmp') or i.endswith('.crdownload'))]
    return len(non_temp_files)


download_dir = 'your/download/dir'
original_count = get_non_temp_len(download_dir)  # get the file count at the start

# do your selenium stuff

while original_count == get_non_temp_len(download_dir):
    time.sleep(.5)  # wait for file count to change
driver.quit()
I had the same problem and this method worked for me.
import time
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import ElementClickInterceptedException
from threading import Thread
import os
import datetime


def checkFilePresence(downloadPath, numberOfFilesInitially, artistName,
                      songTitle):
    timeNow = datetime.datetime.now()
    found = False
    while not found:
        numberOfFilesNow = len(os.listdir(downloadPath))
        if numberOfFilesNow > numberOfFilesInitially:
            for folders, subfolders, files in os.walk(downloadPath):
                for file in files:
                    modificationTime = datetime.datetime.fromtimestamp(
                        os.path.getctime(os.path.join(folders, file)))
                    if modificationTime > timeNow:
                        if file.endswith('.mp3'):
                            return
This code works in headless mode and returns the downloaded file name (based on @protonum's code):
def wait_for_downloads(download_path):
    max_delay = 30
    interval_delay = 0.5
    total_delay = 0
    file = ''
    done = False
    while not done and total_delay < max_delay:
        files = [f for f in os.listdir(download_path) if f.endswith('.crdownload')]
        if not files and len(file) > 1:
            done = True
        if files:
            file = files[0]
        time.sleep(interval_delay)
        total_delay += interval_delay
    if not done:
        logging.error("File(s) couldn't be downloaded")
    return download_path + '/' + file.replace(".crdownload", "")
def wait_for_download_to_be_done(self, path_to_folder, file_name):
    max_time = 60
    time_counter = 0
    while not os.path.exists(path_to_folder + file_name) and time_counter < max_time:
        sleep(0.5)
        time_counter += 0.5
    if time_counter == max_time:
        assert os.path.exists(path_to_folder + file_name), "The file wasn't downloaded"
When using test automation, it's crucial that developers make the software testable. It is your job to check the software together with its testability, meaning that you may need to request a spinner or a simple HTML tag that indicates when the download has finished successfully.
In a case like yours, where you cannot check it in the UI and you cannot check the file system, this is the best way to solve it.
