It's my first time using SQLAlchemy and pandas to insert data into a ClickHouse database. When I insert the data with the clickhouse-client CLI it works fine, but when I do the same thing through SQLAlchemy, one row goes missing and I don't understand why.
Have I done something wrong?
import pandas as pd
from sqlalchemy import create_engine, MetaData
# assuming make_session comes from the clickhouse-sqlalchemy package
from clickhouse_sqlalchemy import make_session

# the dataframe (df) and the connection uri are created beforehand
engine = create_engine(uri)
session = make_session(engine)
metadata = MetaData(bind=engine)
metadata.reflect(bind=engine)

conn = engine.connect()
df.to_sql('test', conn, if_exists='append', index=False)
Let's try this way:
import pandas as pd
from infi.clickhouse_orm.engines import Memory
from infi.clickhouse_orm.fields import UInt16Field, StringField
from infi.clickhouse_orm.models import Model
from sqlalchemy import create_engine

# define the ClickHouse table schema
class Test_Humans(Model):
    year = UInt16Field()
    first_name = StringField()

    engine = Memory()

engine = create_engine('clickhouse://default:@localhost/test')

# create table
with engine.connect() as conn:
    conn.connection.create_table(Test_Humans)  # https://github.com/Infinidat/infi.clickhouse_orm/blob/master/src/infi/clickhouse_orm/database.py#L142

pdf = pd.DataFrame.from_records([
    {'year': 1994, 'first_name': 'Vova'},
    {'year': 1995, 'first_name': 'Anja'},
    {'year': 1996, 'first_name': 'Vasja'},
    {'year': 1997, 'first_name': 'Petja'},
    # ! sqlalchemy-clickhouse ignores the last item, so add a fake one
    {}
])

pdf.to_sql('test_humans', engine, if_exists='append', index=False)
Take into account that sqlalchemy-clickhouse ignores the last item of each insert, so add a fake one (see the source code and the related issue 10).
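If this comes up in more than one place, the workaround from above can be wrapped in a small helper. This is a minimal sketch, not part of the original answer; it simply pads the frame with one empty row, relying on the same last-row behaviour of sqlalchemy-clickhouse described here:
import pandas as pd

def to_sql_clickhouse(df, table, engine, **kwargs):
    # sqlalchemy-clickhouse drops the last row of each insert, so pad the
    # frame with one throwaway empty row before handing it to pandas.
    padded = pd.concat([df, pd.DataFrame([{}])], ignore_index=True)
    padded.to_sql(table, engine, if_exists='append', index=False, **kwargs)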
I want to get the columns of multiple tables in a SAP HANA database. I am using hdbcli and it is giving this error:
hdbcli.dbapi.Error: (362, 'invalid schema name: INFORMATION_SCHEMA: line 1 col 15
Python code:
import hdbcli
from hdbcli import dbapi
import pandas as pd
from google.cloud import bigquery

conn = dbapi.connect(
    address="example.hana.trial-us10.hanacloud.ondemand.com",
    port=443,
    user='DBADMIN',
    password='example#xxxxxx'
)

tables = ['table1', 'table2', 'table3']
for table in tables:
    cursor = conn.cursor()
    cursor.execute(f"SELECT * FROM INFORMATION_SCHEMA.COLUMNS WHERE TABLE_NAME='{table}'")
    print(f"Table '{table}' description:")
    print([column[3] for column in cursor.fetchall()])
    cursor.close()
conn.close()
I need some help to proceed. Thanks.
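For what it's worth, SAP HANA does not expose an INFORMATION_SCHEMA; column metadata lives in system views such as TABLE_COLUMNS in the SYS schema. Below is a minimal sketch of the same loop against that view. It is untested against this particular HANA Cloud instance and assumes the tables were created with unquoted (hence uppercase) names:
from hdbcli import dbapi

conn = dbapi.connect(
    address="example.hana.trial-us10.hanacloud.ondemand.com",
    port=443,
    user='DBADMIN',
    password='example#xxxxxx'
)

tables = ['table1', 'table2', 'table3']
for table in tables:
    cursor = conn.cursor()
    # SYS.TABLE_COLUMNS is HANA's catalog view for column metadata;
    # unquoted identifiers are stored uppercase, hence table.upper()
    cursor.execute(
        "SELECT COLUMN_NAME FROM SYS.TABLE_COLUMNS WHERE TABLE_NAME = ?",
        (table.upper(),)
    )
    print(f"Table '{table}' columns:")
    print([row[0] for row in cursor.fetchall()])
    cursor.close()
conn.close()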
If I have a table on my server and a connection string for it, how can I load it into a dataframe using Vaex?
Here is what I am doing now with Pandas:
from sqlalchemy import types, create_engine, text
import pandas as pd
import pymysql

def connect_to_data(driver='mysql+pymysql://', conn_string=''):
    try:
        conn = create_engine(driver + conn_string)
        print("MySQL Connection Successful!")
    except Exception as err:
        print("MySQL Connection Failed!")
        print(err)
    return conn

# Connect to the db:
conn_string = 'xxxxxxxx'
conn = connect_to_data(conn_string=conn_string)

# Get all requests from the db:
query = '''SELECT * FROM table_name'''
result = conn.execute(text(query))

# Desired dataframe:
df = pd.read_sql_query(query, conn)
How can I do the same with Vaex (because of its high performance)?
For now at least, you can't do it directly. But vaex can easily read a pandas dataframe, so you can do:
import vaex

# Following your example..
pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
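If the table is too large to build as a single pandas frame, one possible pattern is to read the query in chunks and stitch the pieces together on the vaex side. This is only a sketch reusing query and conn from above, not an official vaex SQL API; the data still ends up in memory, but no single pandas object holds the whole table at once:
import pandas as pd
import vaex

# Convert each chunk to a vaex dataframe as it arrives, then concatenate.
chunks = [
    vaex.from_pandas(chunk, copy_index=False)
    for chunk in pd.read_sql_query(query, conn, chunksize=100_000)
]
df = vaex.concat(chunks)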
By default, this code obtains the daily closing data for several tickers:
import pandas as pd
from pandas_datareader import data as wb  # assumed alias for pandas_datareader's data module

tickers = ['SPY', 'QQQ', 'GLD ', 'EEM', 'IEMG', 'VTI', 'HYG', 'SJNK', 'USO']
ind_data = pd.DataFrame()
for t in tickers:
    ind_data[t] = wb.DataReader(t, data_source='yahoo', start='2015-1-1')['Adj Close']
ind_data.to_excel('C:/Users/micka/Desktop/ETF.xlsx')
How do you add a parameter to DataReader in order to obtain weekly/monthly historical data instead? I tried using freq and interval but it doesn't work.
What if you try to replace this in your code for weekly data:
# Don't forget to import pandas_datareader exactly in this way
import pandas_datareader
# Then replace this in for loop
pandas_datareader.yahoo.daily.YahooDailyReader(t, interval='w', start='2015-1-1').read()['Adj Close']
And this for monthly data:
# Don't forget to import pandas_datareader exactly in this way
import pandas_datareader
# Then replace this in for loop
pandas_datareader.yahoo.daily.YahooDailyReader(t, interval='m', start='2015-1-1').read()['Adj Close']
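If you prefer not to spell out the full YahooDailyReader path, the get_data_yahoo shorthand should accept the same argument, since it forwards its keyword arguments to YahooDailyReader. A sketch, not taken from the question:
from pandas_datareader import data as web

# interval='w' (or 'm') should select weekly (or monthly) bars here as well
weekly_close = web.get_data_yahoo('SPY', start='2015-1-1', interval='w')['Adj Close']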
Check the official doc for further options. After executing this code for the weekly period:
import pandas as pd
import pandas_datareader

tickers = ['SPY', 'QQQ', 'GLD ', 'EEM', 'IEMG', 'VTI', 'HYG', 'SJNK', 'USO']
ind_data = pd.DataFrame()
for t in tickers:
    ind_data[t] = pandas_datareader.yahoo.daily.YahooDailyReader(t, interval='w', start='2015-1-1').read()['Adj Close']
ind_data
I got this:
How can I arrange a bulk insert of a Python dataframe into the corresponding Azure SQL table?
I see that INSERT works with individual records:
INSERT INTO XX ([Field1]) VALUES (value1);
How can I insert the entire content of the dataframe into the Azure table?
Thanks
According to my test, we can also use to_sql to insert data into Azure SQL.
For example:
from urllib.parse import quote_plus
import numpy as np
import pandas as pd
from sqlalchemy import create_engine, event
import pyodbc

# azure sql connection string
conn = 'Driver={ODBC Driver 17 for SQL Server};Server=tcp:<server name>.database.windows.net,1433;Database=<db name>;Uid=<user name>;Pwd=<password>;Encrypt=yes;TrustServerCertificate=no;Connection Timeout=30;'
quoted = quote_plus(conn)
engine = create_engine('mssql+pyodbc:///?odbc_connect={}'.format(quoted))

@event.listens_for(engine, 'before_cursor_execute')
def receive_before_cursor_execute(conn, cursor, statement, params, context, executemany):
    print("FUNC call")
    if executemany:
        cursor.fast_executemany = True

# insert
table_name = 'Sales'
# For the test, I use a csv file to create the dataframe
df = pd.read_csv(r'D:\data.csv')
df.to_sql(table_name, engine, index=False, if_exists='replace', schema='dbo')

# test after inserting
query = 'SELECT * FROM {table}'.format(table=table_name)
dfsql = pd.read_sql(query, engine)
print(dfsql)
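As a side note, recent SQLAlchemy versions (1.3+) let the mssql+pyodbc dialect take fast_executemany directly on create_engine, so the event listener can be skipped. A sketch reusing the connection string, dataframe, and table name from above:
from urllib.parse import quote_plus
from sqlalchemy import create_engine

quoted = quote_plus(conn)  # same ODBC connection string as above
engine = create_engine(
    'mssql+pyodbc:///?odbc_connect={}'.format(quoted),
    fast_executemany=True,  # dialect-level switch, no event listener needed
)
df.to_sql(table_name, engine, index=False, if_exists='replace', schema='dbo')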
I am trying to follow a LinuxAcademy post on how to import Excel data into DynamoDB, but the code in the post is two years old and does not work. Any tips or suggestions would be very helpful.
Sorry, I'm new to Stack Overflow.
I was trying to take an Excel spreadsheet, convert it to JSON, and upload it to DynamoDB like the post on LinuxAcademy. The instructions are old and they use three scripts to upload one file.
Here is the code I used to create an AWS Lambda Python function.
The only problem is that it reads in the Excel file, converts it to JSON, and the file is too big to ingest into DynamoDB before the 5 minute timeout. I will probably convert it to Step Functions, but this worked for me.
import boto3
import os
import sys
import uuid
import pandas as pd

s3_client = boto3.client('s3')
bucket = "serverless-record-storage-lambda"

def upload_to_dynamodb(report):
    df = pd.read_excel(report)
    df.columns = ["APPLICATION", "FORM_NUMBER", "FILE_DATE", "STATUS_DATE", "STATUS", "STATUS_CODE", "EXPIRATION_DATE", "ESTIMATED COST", "REVISED_COST", "EXISTING_USE", "EXISTING_UNITS", "PROPOSED_USE", "PROPOSED_UNITS", "PLANSETS", "15_DAY_HOLD?", "EXISTING_STORIES", "PROPOSED_STORIES", "ASSESSOR_STORIES", "VOLUNTARY", "PAGES", "BLOCK", "LOT", "STREET_NUMBER", "STREET_NUMBER_SFX", "AVS_STREET_NAME", "AVS_STREET_SFX", "UNIT", "UNIT_SFX", "FIRST_NAME", "LAST_NAME", "CONTRACTORPHONE",
                  "COMPANY_NAME", "STREET_NUMBER", "STREET", "STREET_SUFFIX", "CITY", "STATE", "ZIP_CODE", "CONTACT_NAME", "CONTACT_PHONE", "DESCRIPTION"]
    # Clean up the data; change column types to strings to be on the safer side :)
    df = df.replace({'-': '0'}, regex=True)
    df = df.fillna(0)
    for i in df.columns:
        df[i] = df[i].astype(str)
    # Convert dataframe to a list of dictionaries (JSON) that can be consumed by any NoSQL database
    myl = df.T.to_dict().values()
    # Connect to DynamoDB using boto
    resource = boto3.resource('dynamodb', region_name='us-west-2')
    # Connect to the DynamoDB table
    table = resource.Table('permitdata')
    # Load the JSON objects using the put_item method
    for permit in myl:
        table.put_item(Item=permit)

def handler(event, context):
    for record in event['Records']:
        print(record)
        bucket = record['s3']['bucket']['name']
        print(bucket)
        key = record['s3']['object']['key']
        print(key)
        download_path = '/tmp/{}{}'.format(uuid.uuid4(), key)
        upload_path = '/tmp/resized-{}'.format(key)
        s3_client.download_file(bucket, key, download_path)
        upload_to_dynamodb(download_path)

def main():
    # For local testing only: 'event' must be a sample S3 event payload defined elsewhere
    handler(event, None)

if __name__ == "__main__":
    main()
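To cut the per-item overhead that pushes the function past the timeout, boto3's batch_writer can replace the one-by-one put_item loop. This is only a sketch of that one step, reusing table and myl from the code above:
# Inside upload_to_dynamodb, replace the put_item loop with a batch writer:
# it buffers items and flushes them to DynamoDB in batches of up to 25.
with table.batch_writer() as batch:
    for permit in myl:
        batch.put_item(Item=permit)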