I hope somebody can help me debug this issue. I have the following script:
from azure.cosmosdb.table.tableservice import TableService, ListGenerator
from azure.storage.blob import BlobServiceClient
from datetime import date
from datetime import *

def queryAndSaveAllDataBySize(tb_name, resp_data: ListGenerator, table_out: TableService, table_in: TableService, query_size: int):
    for item in resp_data:
        # remove etag and Timestamp appended by table service
        del item.etag
        del item.Timestamp
        print("insert data: " + str(item) + " into table: " + tb_name)
        table_in.insert_or_replace_entity(tb_name, item)
    if resp_data.next_marker:
        data = table_out.query_entities(table_name=tb_name, num_results=query_size, marker=resp_data.next_marker)
        queryAndSaveAllDataBySize(tb_name, data, table_out, table_in, query_size)

tbs_out = table_service_out.list_tables()

for tb in tbs_out:
    # create table with same name in storage2
    table_service_in.create_table(table_name=tb.name, fail_on_exist=False)
    # first query
    data = table_service_out.query_entities(tb.name, num_results=query_size)
    queryAndSaveAllDataBySize(tb.name, data, table_service_out, table_service_in, query_size)
This code checks the tables in storageA, copies their content, and creates the same tables in storageB; thanks to the marker I can get the x_ms_continuation token when a table returns more than 1,000 rows per request.
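In other words, the recursion above just follows the continuation marker page by page. The same idea written as a plain loop would look roughly like this (an illustrative sketch only, assuming a single table name tb_name; not the code I actually run):

data = table_service_out.query_entities(tb_name, num_results=query_size)
while True:
    for item in data:
        # strip the service-generated properties before re-inserting
        del item.etag
        del item.Timestamp
        table_service_in.insert_or_replace_entity(tb_name, item)
    if not data.next_marker:
        break
    # next_marker carries the x-ms-continuation token for the next page
    data = table_service_out.query_entities(
        table_name=tb_name, num_results=query_size, marker=data.next_marker)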
It goes without saying that the script above works just fine as it is.
But yesterday I was trying to make some changes to the code, as follows:
If in storageA I have a table named TEST, in storageB I want to create a table named TEST20210930, basically the table name from storageA plus today's date.
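(For clarity, the date suffix is just today's date formatted as YYYYMMDD; its definition is not shown in the code below, so the following line is only an assumed sketch of it:)

from datetime import date

# assumed definition of the suffix used below, e.g. '20210930'
today = date.today().strftime("%Y%m%d")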
This is where the code starts breaking down.
table_service_out = TableService(account_name='', account_key='')
table_service_in = TableService(account_name='', account_key='')
query_size = 100

# save data to storage2 and check if there is leftover data in the current table; if yes, recurse
def queryAndSaveAllDataBySize(tb_name, resp_data: ListGenerator, table_out: TableService, table_in: TableService, query_size: int):
    for item in resp_data:
        # remove etag and Timestamp appended by table service
        del item.etag
        del item.Timestamp
        print("insert data: " + str(item) + " into table: " + tb_name)
        table_in.insert_or_replace_entity(tb_name, item)
    if resp_data.next_marker:
        data = table_out.query_entities(table_name=tb_name, num_results=query_size, marker=resp_data.next_marker)
        queryAndSaveAllDataBySize(tb_name, data, table_out, table_in, query_size)

tbs_out = table_service_out.list_tables()
print(tbs_out)

for tb in tbs_out:
    table = tb.name + today
    print(target_connection_string)
    # create table with same name in storage2
    table_service_in.create_table(table_name=table, fail_on_exist=False)
    # first query
    data = table_service_out.query_entities(tb.name, num_results=query_size)
    queryAndSaveAllDataBySize(table, data, table_service_out, table_service_in, query_size)
What happens here is that the code runs up to the query_size limit but then fails, saying that the table was not found.
I am a bit confused here, and maybe somebody can help me spot my error.
Please, if you need more info, just ask.
Thank you so so so much.
HOW TO REPRODUCE:
In the Azure portal, create 2 storage accounts: StorageA and StorageB.
In StorageA, create a table and fill it with data, over 100 rows (based on the query_size).
Set the configuration endpoints: table_service_out = StorageA and table_service_in = StorageB.
I believe the issue is with the following line of code:
data = table_out.query_entities(table_name=tb_name,num_results=query_size,marker=resp_data.next_marker)
If you notice, tb_name is the name of the table in your target account, which is obviously not present in your source account. Because you're querying a table that does not exist, you're getting this error.
To fix this, you should also pass the name of the source table to queryAndSaveAllDataBySize and use that when querying entities in that function.
UPDATE
Please take a look at the code below:
table_service_out = TableService(account_name='', account_key='')
table_service_in = TableService(account_name='', account_key='')
query_size = 100

# save data to storage2 and check if there is leftover data in the current table; if yes, recurse
def queryAndSaveAllDataBySize(source_table_name, target_table_name, resp_data: ListGenerator, table_out: TableService, table_in: TableService, query_size: int):
    for item in resp_data:
        # remove etag and Timestamp appended by table service
        del item.etag
        del item.Timestamp
        print("insert data: " + str(item) + " into table: " + target_table_name)
        table_in.insert_or_replace_entity(target_table_name, item)
    if resp_data.next_marker:
        data = table_out.query_entities(table_name=source_table_name, num_results=query_size, marker=resp_data.next_marker)
        queryAndSaveAllDataBySize(source_table_name, target_table_name, data, table_out, table_in, query_size)

tbs_out = table_service_out.list_tables()
print(tbs_out)

for tb in tbs_out:
    table = tb.name + today
    print(target_connection_string)
    # create table with same name in storage2
    table_service_in.create_table(table_name=table, fail_on_exist=False)
    # first query
    data = table_service_out.query_entities(tb.name, num_results=query_size)
    queryAndSaveAllDataBySize(tb.name, table, data, table_service_out, table_service_in, query_size)
Related
I have been using Python to read and write data to Snowflake for some time now, to a table I have full update rights to, using a Snowflake helper class my colleague found on the internet. Please see below for the class I have been using, with my personal Snowflake connection information abstracted, and a simple read query that works, given you have a 'TEST' table in your schema.
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine
import keyring
import pandas as pd
from sqlalchemy import text

# Pull the username and password to be used to connect to snowflake
stored_username = keyring.get_password('my_username', 'username')
stored_password = keyring.get_password('my_password', 'password')

class SNOWDBHelper:
    def __init__(self):
        self.user = stored_username
        self.password = stored_password
        self.account = 'account'
        self.authenticator = 'authenticator'
        self.role = stored_username + '_DEV_ROLE'
        self.warehouse = 'warehouse'
        self.database = 'database'
        self.schema = 'schema'

    def __connect__(self):
        self.url = URL(
            user=self.user,
            password=self.password,
            account=self.account,
            authenticator=self.authenticator,
            role=self.role,
            warehouse=self.warehouse,
            database=self.database,
            schema=self.schema
        )
        self.engine = create_engine(self.url)
        self.connection = self.engine.connect()

    def __disconnect__(self):
        self.connection.close()

    def read(self, sql):
        self.__connect__()
        result = pd.read_sql_query(sql, self.engine)
        self.__disconnect__()
        return result

    def write(self, wdf, tablename):
        self.__connect__()
        wdf.to_sql(tablename.lower(), con=self.engine, if_exists='append', index=False)
        self.__disconnect__()

# Initiate the SNOWDBHelper()
SNOWDB = SNOWDBHelper()

query = """SELECT * FROM """ + 'TEST'
snow_table = SNOWDB.read(query)
I now have the need to update an existing Snowflake table, and my colleague suggested I could use the read function to send the query containing the update SQL to my Snowflake table. So I adapted an update query I use successfully in the Snowflake UI to update tables and used the read function to send it to Snowflake. It actually tells me that the relevant rows in the table have been updated, but they have not. Please see below for the update query I use to attempt to change a field "field" in the "test" table to "X", and the success message I get back. I'm not thrilled with this hacky update method overall (where the table update is a side effect of sorts?), but could someone please help with a method to update within this framework?
# Query I actually store in file: '0-Query-Update-Effective-Dating.sql'
UPDATE "Database"."Schema"."Test" AS UP
SET UP.FIELD = 'X'
# Read the query in from file and utilize it
update_test = open('0-Query-Update-Effective-Dating.sql')
update_query = text(update_test.read())
SNOWDB.read(update_query)
# Returns message of updated rows, but no rows updated
   number of rows updated  number of multi-joined rows updated
0                     316                                    0
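One possible direction (only a rough, untested sketch; the execute method name is hypothetical, not part of the original class) would be to add a method to the same helper class that runs DML inside an explicit transaction instead of going through pandas:

from sqlalchemy import text

# hypothetical method to add to SNOWDBHelper (untested sketch)
def execute(self, sql):
    self.__connect__()
    # engine.begin() opens a transaction and commits on success,
    # so the UPDATE is not discarded when the connection is closed
    with self.engine.begin() as conn:
        result = conn.execute(text(sql))
    self.__disconnect__()
    return result.rowcount

The update file from above would then be run as SNOWDB.execute(open('0-Query-Update-Effective-Dating.sql').read()), but again, this is only a sketch.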
My question is somewhat similar to this one: How to save out in a new column the url which is reading pandas read_html() function?
I have a set of links that contain tables (4 tables each, and I need only the first three of them). The goal is to store the link of each table in a separate 'address' column.
links = ['www.link1.com', 'www.link2.com', ... , 'www.linkx.com']

details = []

for link in tqdm(links):
    page = requests.get(link)
    sauce = BeautifulSoup(page.content, 'lxml')
    table = sauce.find_all('table')
    # Only first 3 tables include data
    for i in range(3):
        details.append(pd.read_html(str(table))[i])
    final_df = pd.concat(details, ignore_index=True)
    final_df['address'] = link
    time.sleep(2)
However, when I use this code, only the last link is assigned to every row in the 'address' column.
I'm probably missing a detail, but I've spent the last 2 hours figuring this out and simply can't make any progress - I would really appreciate some help.
You are close to your goal - add df['address'] to your DataFrame in each iteration, before appending it to your list:
for i in table[:3]:
    df = pd.read_html(str(i))[0]
    df['address'] = link
    details.append(df)
Note: You could also slice your ResultSet of tables (table[:3]) so you do not have to use range.
Move the concatenation outside of your loop and call it once your iterations are over:
final_df = pd.concat(details, ignore_index=True)
Example
import pandas as pd

links = ['www.link1.com', 'www.link2.com', 'www.linkx.com']
details = []

for link in links:
    # page = requests.get(link)
    # sauce = BeautifulSoup(page.content, 'lxml')
    # table = sauce.find_all('table')
    table = ['<table><tr><td>table 1</td></tr></table>',
             '<table><tr><td>table 2</td></tr></table>',
             '<table><tr><td>table 3</td></tr></table>']

    # Only first 3 tables include data
    for i in table[:3]:
        df = pd.read_html(str(i))[0]
        df['address'] = link
        details.append(df)

final_df = pd.concat(details, ignore_index=True)
Output
         0        address
0  table 1  www.link1.com
1  table 2  www.link1.com
2  table 3  www.link1.com
3  table 1  www.link2.com
4  table 2  www.link2.com
5  table 3  www.link2.com
6  table 1  www.linkx.com
7  table 2  www.linkx.com
8  table 3  www.linkx.com
I want to export my data from Databricks to Azure blob storage. My Databricks commands select some PDFs from my blob storage, run Form Recognizer, and export the output results to my blob storage.
Here is my code:
%pip install azure.storage.blob
%pip install azure.ai.formrecognizer

from azure.storage.blob import ContainerClient

container_url = "https://mystorageaccount.blob.core.windows.net/pdf-raw"
container = ContainerClient.from_container_url(container_url)

for blob in container.list_blobs():
    blob_url = container_url + "/" + blob.name
    print(blob_url)

import requests
from azure.ai.formrecognizer import FormRecognizerClient
from azure.core.credentials import AzureKeyCredential

endpoint = "https://myendpoint.cognitiveservices.azure.com/"
key = "mykeynumber"
form_recognizer_client = FormRecognizerClient(endpoint, credential=AzureKeyCredential(key))

import pandas as pd

field_list = ["InvoiceDate", "InvoiceID", "Items", "VendorName"]
df = pd.DataFrame(columns=field_list)

for blob in container.list_blobs():
    blob_url = container_url + "/" + blob.name
    poller = form_recognizer_client.begin_recognize_invoices_from_url(invoice_url=blob_url)
    invoices = poller.result()
    print("Scanning " + blob.name + "...")
    for idx, invoice in enumerate(invoices):
        single_df = pd.DataFrame(columns=field_list)
        for field in field_list:
            entry = invoice.fields.get(field)
            if entry:
                single_df[field] = [entry.value]
        single_df['FileName'] = blob.name
        df = df.append(single_df)

df = df.reset_index(drop=True)
df

account_name = "mystorageaccount"
account_key = "fs.azure.account.key." + account_name + ".blob.core.windows.net"

try:
    dbutils.fs.mount(
        source = "wasbs://pdf-recognized@mystorageaccount.blob.core.windows.net",
        mount_point = "/mnt/pdf-recognized",
        extra_configs = {account_key: dbutils.secrets.get(scope="formrec", key="formreckey")})
except:
    print('Directory already mounted or error')

df.to_csv(r"/dbfs/mnt/pdf-recognized/output.csv", index=False)
The code runs fine until the very last line. I get the following error message:
Directory already mounted or error. FileNotFoundError: [Errno 2] No such file or directory: '/dbfs/mnt/pdf-recognized/output.csv'.
I tried using /dbfs:/ instead of /dbfs/, but I don't know what I am doing wrong.
How can I export my Databricks results to the blob?
Thank you
It looks like you're trying to mount storage that was already mounted. Really, the mount operation should be done only once, not dynamically. You have several choices to implement it correctly:
Unmount before mounting, using dbutils.fs.unmount("/mnt/pdf-recognized").
Check if the storage is already mounted and only run the mount if it isn't. Something like this (not tested):
mounts = [mount for mount in dbutils.fs.mounts()
          if mount.mountPoint == "/mnt/pdf-recognized"]
if len(mounts) == 0:
    dbutils.fs.mount(....)
You don't really need a mount; it has the "bad" property that anyone in the workspace can use it with the permissions that were used for mounting. It could be simpler to write the results to local disk and then copy the file to the necessary location using dbutils.fs.cp with the wasbs protocol. Something like this:
df.to_csv(r"/tmp/my-output.csv", index=False)
spark.conf.set(account_key, dbutils.secrets.get(scope="formrec", key="formreckey"))
dbutils.fs.cp("file:///tmp/my-output.csv",
              "wasbs://pdf-recognized@mystorageaccount.blob.core.windows.net/output.csv")
I need help to get through this workflow.
I have 2 storage accounts, which I name storage1 and storage2.
storage1 contains a list of tables with some data in them, and I would like to loop through all those tables and copy their content into storage2. I tried azCopy, but I had no luck, as this feature is available only in azCopy v7.3 and I couldn't find that version for macOS M1. The other option is Data Factory, but it's too complex for what I want to achieve. So I decided to go with the Azure Python SDK.
As a library I am using azure.data.tables and its TableServiceClient.
The code I wrote looks like this:
from azure.data.tables import TableServiceClient

my_conn_str_out = 'storage1-Conn-Str'
table_service_client_out = TableServiceClient.from_connection_string(my_conn_str_out)

list_table = []
for table in table_service_client_out.list_tables():
    list_table.append(table.table_name)

my_conn_str_in = 'Storage2-Conn-str'
table_service_client_in = TableServiceClient.from_connection_string(my_conn_str_in)

for new_tables in table_service_client_out.list_tables():
    table_service_client_in.create_table_if_not_exists(new_tables.table_name)
    print(f'tables created successfully {new_tables.table_name}')
This is how I structured my code:
for table in table_service_client_out.list_tables():
    list_table.append(table.table_name)
Here I loop through all my tables in the storage account and append their names to a list.
Then:
for new_tables in table_service_client_out.list_tables():
    table_service_client_in.create_table_if_not_exists(new_tables.table_name)
    print(f'tables created successfully {new_tables.table_name}')
Here I create the same tables in storage2.
So far everything works just fine.
What I would like to achieve now is to query all the data in each table in storage1 and pass it to the respective table in storage2.
According to the Microsoft documentation, I can query a table using this:
query = table_service_client_out.query_tables(filter=table)
So I integrated this into my loop like this:
for table in table_service_client_out.list_tables():
    query = table_service_client_out.query_tables(filter=table)
    list_table.append(table.table_name)
    print(query)
When I run my Python code, I get back the repr of the query iterator (its memory address) and not the data in the tables:
<iterator object azure.core.paging.ItemPaged at 0x7fcd90c8fbb0>
<iterator object azure.core.paging.ItemPaged at 0x7fcd90c8f7f0>
<iterator object azure.core.paging.ItemPaged at 0x7fcd90c8fd60>
I was wondering if there is a way I can query all the data in my tables and pass it to storage2.
Try this:
from azure.cosmosdb.table.tableservice import TableService, ListGenerator

table_service_out = TableService(account_name='', account_key='')
table_service_in = TableService(account_name='', account_key='')

# query 100 items per request, in case loading all data at once consumes too much memory
query_size = 100

# save data to storage2 and check if there is leftover data in the current table; if yes, recurse
def queryAndSaveAllDataBySize(tb_name, resp_data: ListGenerator, table_out: TableService, table_in: TableService, query_size: int):
    for item in resp_data:
        # remove etag and Timestamp appended by table service
        del item.etag
        del item.Timestamp
        print("insert data: " + str(item) + " into table: " + tb_name)
        table_in.insert_entity(tb_name, item)
    if resp_data.next_marker:
        data = table_out.query_entities(table_name=tb_name, num_results=query_size, marker=resp_data.next_marker)
        queryAndSaveAllDataBySize(tb_name, data, table_out, table_in, query_size)

tbs_out = table_service_out.list_tables()

for tb in tbs_out:
    # create table with same name in storage2
    table_service_in.create_table(tb.name)
    # first query
    data = table_service_out.query_entities(tb.name, num_results=query_size)
    queryAndSaveAllDataBySize(tb.name, data, table_service_out, table_service_in, query_size)
Of course, this is a simple demo for your requirement. For more efficiency, you can also query table data by partition key and commit the entities in batches, as sketched below.
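For example, the batch part could look roughly like this (a sketch, untested; note that a single Azure Table batch is limited to 100 operations and all entities in it must share the same PartitionKey):

from azure.cosmosdb.table.tableservice import TableService
from azure.cosmosdb.table.tablebatch import TableBatch

def save_batch(tb_name, entities, table_in: TableService):
    # all entities in one batch must share the same PartitionKey,
    # and a batch may contain at most 100 operations
    batch = TableBatch()
    for item in entities:
        del item.etag
        del item.Timestamp
        batch.insert_or_replace_entity(item)
    table_in.commit_batch(tb_name, batch)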
Let me know if you have any more questions.
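Side note: if you would rather stay on the azure.data.tables package from your question, a rough equivalent of the copy loop could look like this (again just a sketch, untested; connection strings are the placeholders from your code):

from azure.data.tables import TableServiceClient

svc_out = TableServiceClient.from_connection_string('storage1-Conn-Str')
svc_in = TableServiceClient.from_connection_string('Storage2-Conn-str')

for table in svc_out.list_tables():
    name = table.table_name
    svc_in.create_table_if_not_exists(name)
    src = svc_out.get_table_client(name)
    dst = svc_in.get_table_client(name)
    # list_entities() pages through the whole table lazily
    for entity in src.list_entities():
        dst.upsert_entity(entity)
    print(f'copied table {name}')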
I have two DynamoDB tables, and I want to copy just 100 records out of 5,000 to the other table. Is there any way to do so? I currently have a script which copies the entire contents of the table, but I just need a few records to be copied.
Does AWS have any way to do so? Here is the script that I am currently using:
import boto3
import os
import datetime

def lambda_handler(event, context):
    from datetime import datetime

    dynamotargetclient = boto3.client('dynamodb', region_name='****',  ## Account key
                                      aws_access_key_id='****',
                                      aws_secret_access_key='****')

    dynamoclient = boto3.client('dynamodb', region_name='***',  ## UAT Account key
                                aws_access_key_id='****',
                                aws_secret_access_key='****')

    # print("Scanning and replicating Table Name : " + table)
    dateTimeObj = datetime.now()
    print(dateTimeObj)

    dynamopaginator = dynamoclient.get_paginator('scan')
    tabname = '****'
    targettabname = '****'
    dynamoresponse = dynamopaginator.paginate(
        TableName=tabname,
        Select='ALL_ATTRIBUTES',
        ReturnConsumedCapacity='NONE',
        ConsistentRead=True
    )
    for page in dynamoresponse:
        for item in page['Items']:
            dynamotargetclient.put_item(
                TableName=targettabname,
                Item=item
            )
    print("Replication complete for Table Name : " + targettabname)
    print(datetime.now())
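One way to cap the copy at 100 records (a sketch based on the script above, untested; table names and credentials are placeholders) is to let the paginator stop early via PaginationConfig and keep a counter:

import boto3

dynamoclient = boto3.client('dynamodb', region_name='***')          # source
dynamotargetclient = boto3.client('dynamodb', region_name='****')   # target

paginator = dynamoclient.get_paginator('scan')
pages = paginator.paginate(
    TableName='****',
    Select='ALL_ATTRIBUTES',
    ConsistentRead=True,
    # stop the scan after roughly 100 items instead of reading the whole table
    PaginationConfig={'MaxItems': 100, 'PageSize': 100}
)

copied = 0
for page in pages:
    for item in page['Items']:
        if copied >= 100:
            break
        dynamotargetclient.put_item(TableName='****', Item=item)
        copied += 1
print("Copied " + str(copied) + " items")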