Error with bulk loading from CSV to Postgres - psycopg2

import pandas
import pygrametl
import psycopg2
from pygrametl.tables import SlowlyChangingDimension, CachedDimension, BulkDimension
from pygrametl.datasources import CSVSource

# Connection to Postgres
connection = psycopg2.connect(host="localhost", database="postgres",
                              user="postgres", password="tekihcan")
connect = pygrametl.ConnectionWrapper(connection)

def pgcopybulkloader(name, atts, fieldsep, rowsep, nullval, filehandle):
    # Here we use driver-specific code to get fast bulk loading.
    # You can change this method if you use another driver or you can
    # use the FactTable or BatchFactTable classes (which don't require
    # use of driver-specific code) instead of the BulkFactTable class.
    global connection
    curs = connect.cursor()
    try:
        curs.copy_from(file=filehandle, table=name, sep=fieldsep,
                       columns=atts, null='null')
    except (Exception, psycopg2.DatabaseError) as error:
        print("Error %s" % error)

date_dim = BulkDimension(name='date_dim',
                         key='d_date_sk',
                         attributes=['d_date_id (B)',
                                     'd_date',
                                     'd_month_seq',
                                     'd_week_seq',
                                     'd_quarter_seq',
                                     'd_year',
                                     'd_dow',
                                     'd_moy',
                                     'd_dom',
                                     'd_qoy',
                                     'd_fy_year',
                                     'd_fy_quarter_seq',
                                     'd_fy_week_seq',
                                     'd_day_name',
                                     'd_quarter_name',
                                     'd_holiday',
                                     'd_weekend',
                                     'd_following_holiday',
                                     'd_first_dom',
                                     'd_last_dom',
                                     'd_same_day_ly',
                                     'd_same_day_lq',
                                     'd_current_day',
                                     'd_current_week',
                                     'd_current_month',
                                     'd_current_quarter',
                                     'd_current_year'],
                         lookupatts=['d_date_id (B)'],
                         bulkloader=pgcopybulkloader)

date_dim_source = CSVSource(open('C:/Users/HP/Documents/v2.13.0rc1/data/date_dim.csv',
                                 'r', 16384), delimiter='|')

def main():
    for row in date_dim_source:
        date_dim.insert(row)
The code is failing with an error. As I understand it, the error is caused by the target table being empty. The CSV source doesn't have a header row either - could this be impacting the code?
Please find the link that was used to develop the code: https://chrthomsen.github.io/pygrametl/
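For what it's worth, pygrametl's CSVSource is an alias for Python's csv.DictReader, so when the CSV file has no header row the field names can be passed explicitly. The sketch below is not a drop-in fix: it reuses date_dim and connect from the code above, the field name list is truncated for brevity and is assumed to match the dimension's key and attribute names in file order, and it also calls main() and commits, which the original never does.
fieldnames = ['d_date_sk', 'd_date_id (B)', 'd_date', 'd_month_seq']  # ... continue with the remaining attributes in CSV column order
date_dim_source = CSVSource(open('C:/Users/HP/Documents/v2.13.0rc1/data/date_dim.csv',
                                 'r', 16384),
                            delimiter='|', fieldnames=fieldnames)

def main():
    for row in date_dim_source:
        date_dim.insert(row)

if __name__ == '__main__':
    main()
    connect.commit()   # committing through the ConnectionWrapper should trigger the actual bulk load
    connect.close()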

Related

Not able to retrieve tick data - Zerodha Web Socket

I am using the code below to connect to Zerodha's WebSocket API and pull tick data for a particular instrument I am interested in. When I run it, I am not able to pull any information, and I am not sure whether I am calling the functions in the Streaming_Ticks class properly.
The instrument token, which is the input, is placed in "parameter_file.csv", and this token needs to be passed to the on_connect callback function inside the Streaming_Ticks class.
I would welcome your comments on how to run this code correctly.
from kiteconnect import KiteConnect
from kiteconnect import KiteTicker
import os
import csv

#cwd = os.chdir("E:\\Algorthmic Trading\\Zerodha_Training")

class Streaming_Ticks:
    def __init__(self):
        access_token = open("access_token.txt", 'r').read()
        key_secret = open("key_info.txt", 'r').read().split()
        self.kite = KiteConnect(api_key=key_secret[0])
        self.kite.set_access_token(access_token)
        self.kws = KiteTicker(key_secret[0], self.kite.access_token)

    def on_ticks(ws, ticks):
        # Callback to receive ticks.
        #logging.debug("Ticks: {}".format(ticks))
        print(ticks)

    def on_connect(ws, response):
        # Callback on successful connect.
        # Subscribe to a list of instrument_tokens (RELIANCE and ACC here).
        #logging.debug("on connect: {}".format(response))
        print(token_list)
        ws.subscribe(token_list)
        ws.set_mode(ws.MODE_FULL, token_list)    # Set all token ticks in `full` mode.
        #ws.set_mode(ws.MODE_FULL, [tokens[0]])  # Set one token tick in `full` mode.

if __name__ == "__main__":
    cwd = os.chdir("E:\\Algorthmic Trading\\Zerodha_Training")
    tick_data = Streaming_Ticks()
    token_list = []
    with open('parameter_file.csv') as param_file:
        param_reader = csv.DictReader(param_file)
        for row in param_reader:
            token_list.append(int(row['token']))

    tick_data.on_ticks = tick_data.on_ticks
    tick_data.on_connect = tick_data.on_connect
    tick_data.kws.connect()
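Not a definitive fix, but the pattern in the kiteconnect documentation is to attach the callbacks to the KiteTicker object (kws) rather than to the wrapper instance; assigning tick_data.on_ticks to itself does nothing. A minimal sketch, reusing Streaming_Ticks and token_list from the code above:
def on_ticks(ws, ticks):
    # Callback to receive ticks.
    print(ticks)

def on_connect(ws, response):
    # Callback on successful connect: subscribe to the tokens read from parameter_file.csv.
    ws.subscribe(token_list)
    ws.set_mode(ws.MODE_FULL, token_list)

tick_data = Streaming_Ticks()
tick_data.kws.on_ticks = on_ticks       # bind callbacks to the ticker itself
tick_data.kws.on_connect = on_connect
tick_data.kws.connect()                 # blocks and starts streaming ticks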

Google cloud function (python) does not deploy - Function failed on loading user code

I'm deploying a simple Python function to Google Cloud Functions but cannot get it to save. It shows this error:
"Function failed on loading user code. This is likely due to a bug in the user code. Error message: Error: please examine your function logs to see the error cause: https://cloud.google.com/functions/docs/monitoring/logging#viewing_logs. Additional troubleshooting documentation can be found at https://cloud.google.com/functions/docs/troubleshooting#logging. Please visit https://cloud.google.com/functions/docs/troubleshooting for in-depth troubleshooting documentation."
Logs don't seem to show much that would indicate an error in the code. I followed this guide: https://blog.thereportapi.com/automate-a-daily-etl-of-currency-rates-into-bigquery/
The only differences are the environment variables and the endpoint I'm using.
The code is below; it is just a GET request followed by a push of the data into a table.
import requests
import json
import time
import os
from google.cloud import bigquery

# Set any default values for these variables if they are not found in environment variables
PROJECT_ID = os.environ.get("PROJECT_ID", "xxxxxxxxxxxxxx")
EXCHANGERATESAPI_KEY = os.environ.get("EXCHANGERATESAPI_KEY", "xxxxxxxxxxxxxxx")
REGIONAL_ENDPOINT = os.environ.get("REGIONAL_ENDPOINT", "europe-west1")
DATASET_ID = os.environ.get("DATASET_ID", "currency_rates")
TABLE_NAME = os.environ.get("TABLE_NAME", "currency_rates")
BASE_CURRENCY = os.environ.get("BASE_CURRENCY", "SEK")
SYMBOLS = os.environ.get("SYMBOLS", "NOK,EUR,USD,GBP")

def hello_world(request):
    latest_response = get_latest_currency_rates()
    write_to_bq(latest_response)
    return "Success"

def get_latest_currency_rates():
    PARAMS = {'access_key': EXCHANGERATESAPI_KEY, 'symbols': SYMBOLS, 'base': BASE_CURRENCY}
    response = requests.get("https://api.exchangeratesapi.io/v1/latest", params=PARAMS)
    print(response.json())
    return response.json()

def write_to_bq(response):
    # Instantiates a client
    bigquery_client = bigquery.Client(project=PROJECT_ID)
    # Prepares a reference to the dataset and table
    dataset_ref = bigquery_client.dataset(DATASET_ID)
    table_ref = dataset_ref.table(TABLE_NAME)
    table = bigquery_client.get_table(table_ref)
    # Get the current timestamp so we know how fresh the data is
    timestamp = time.time()
    jsondump = json.dumps(response)  # Returns a string
    # Ensure the response is a string, not JSON
    rows_to_insert = [{"timestamp": timestamp, "data": jsondump}]
    errors = bigquery_client.insert_rows(table, rows_to_insert)  # API request
    print(errors)
    assert errors == []
I tried just the part that does the GET request in an offline editor and I can confirm that a response comes back fine. I suspect it might have something to do with permissions or the way the script tries to access the database.
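For context, "Function failed on loading user code" usually means the module could not even be imported in the Functions runtime, most often because a dependency used at module level (here requests and google-cloud-bigquery) is missing from requirements.txt next to main.py. A rough local smoke test, assuming the code above lives in main.py:
# local_check.py - a minimal sketch, not a deployment step:
# importing the module runs all module-level code, which is where
# "failed on loading user code" errors usually surface (e.g. a missing package).
import main
print("main.py imported cleanly; hello_world is", main.hello_world)
If the import fails locally with ModuleNotFoundError, the same packages need to be declared in requirements.txt for the deployment.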

is there a way to convert a python script into one class or a package?

I am migrating data from IBM to Snowflake in three stages: extract, transform and load.
Below is the Python code that connects the source (IBM) and the destination (Snowflake) and does the ETL.
Is there any way I can create a class/package out of the entire code below?
import snowflake.connector

tableName = 'F58001'
ctx = snowflake.connector.connect(
    user='*',
    password='*',
    account='*.azure'
)
cs = ctx.cursor()
ctx.cursor().execute("USE DATABASE STORE_PROFILE")
ctx.cursor().execute("USE SCHEMA LANDING")

try:
    ctx.cursor().execute("PUT file:///temp/data/{tableName}/* @%{tableName}".format(tableName=tableName))
except Exception:
    pass

ctx.cursor().execute("truncate table {tableName}".format(tableName=tableName))
ctx.cursor().execute("COPY INTO {tableName} ON_ERROR = 'CONTINUE' ".format(tableName=tableName,
                     FIELD_OPTIONALLY_ENCLOSED_BY='""', sometimes=',',
                     ERROR_ON_COLUMN_COUNT_MISMATCH='TRUE'))

last_query_id = ctx.cursor().execute("select last_query_id()")
for res in last_query_id:
    query_id = res[0]

ctx.cursor().execute("create or replace table save_copy_errors as select * from "
                     "table(validate(" + tableName + ", job_id => '" + query_id + "'))")

ax = ctx.cursor().execute("select * from save_copy_errors")
for errors in ax:
    error = errors
    print(error)

ctx.close()
Please look at the repository below. It probably has the answer to your question. I am currently working on moving it to PyPI so that it can be installed with pip.
https://github.com/Infosys/Snowflake-Python-Development-Framework
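As a rough illustration of the class idea (SnowflakeLoader and its parameter names are invented for this sketch, not part of the framework linked above), the script could be wrapped along these lines:
import snowflake.connector

class SnowflakeLoader:
    """Sketch: wraps the connection plus the PUT / TRUNCATE / COPY steps from the script above."""

    def __init__(self, user, password, account, database, schema):
        self.ctx = snowflake.connector.connect(user=user, password=password, account=account)
        self.ctx.cursor().execute("USE DATABASE {}".format(database))
        self.ctx.cursor().execute("USE SCHEMA {}".format(schema))

    def load(self, table_name, stage_path):
        cs = self.ctx.cursor()
        cs.execute("PUT file://{} @%{}".format(stage_path, table_name))  # stage the files
        cs.execute("truncate table {}".format(table_name))
        cs.execute("COPY INTO {} ON_ERROR = 'CONTINUE'".format(table_name))

    def close(self):
        self.ctx.close()

# usage (credentials and paths are placeholders)
loader = SnowflakeLoader(user='*', password='*', account='*.azure',
                         database='STORE_PROFILE', schema='LANDING')
loader.load('F58001', '/temp/data/F58001/*')
loader.close()
Packaging it is then a matter of putting the class in its own module and adding a setup.py/pyproject.toml around it.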

Scrapy crawler can't parse data from MySQL database

I built a web crawler with Scrapy and I store the data (the page source crawled from each URL) in a MySQL database. Now I would like to do offline editing, so I have written SQL queries in Python to export that data and then tried to crawl from it.
Could you please suggest how to do this? I failed to do it with Scrapy, so if anyone has any suggestion or a similar project, please help me out.
I have tried to query the database with Scrapy and store the data as follows:
from scrapy.http import HtmlResponse
import mysql
from mysql.connector import Error
import scrapy
import re
import requests
from bs4 import BeautifulSoup

# This connects to the database, queries all URLs that have been crawled and stores them in records.
class database:
    def query():
        try:
            connection = mysql.connector.connect(host='',
                                                 database='',
                                                 user='',
                                                 password='')
            cursor = connection.cursor(prepared=True)
            if connection.is_connected():
                db_Info = connection.get_server_info()
                done = "Connected to MySQL database... MySQL Server version on "
            sql_select_Query = """ SELECT `job_url`, `job_description` FROM `store_all` WHERE job_url LIKE '%kariera.gr%' """
            cursor = connection.cursor()
            cursor.execute(sql_select_Query)
            records = cursor.fetchall()
        except mysql.connector.Error as error:
            not_done = "Failed to connect {}".format(error)
        return records

    def insert(job_url, metakey, metavalue):
        try:
            connection = mysql.connector.connect(host='',
                                                 database='',
                                                 user='',
                                                 password='')
            cursor = connection.cursor(prepared=True)
            sql_insert_query = """ INSERT INTO `store` (`url`, `metakey`, `metavalue`) VALUES (%s,%s,%s)"""
            insert_tuple = (job_url, metakey, metavalue)
            result = cursor.execute(sql_insert_query, insert_tuple)
            connection.commit()
            done = "Record inserted successfully into python_users table"
        except mysql.connector.Error as error:
            connection.rollback()
            not_done = "Failed to insert into MySQL table {}".format(error)
        return done

class Crawler(scrapy.Spider, database):
    records = database.query()
    records = records[0]
    response = HtmlResponse(url="Any String", body=records, encoding='utf-8')
    job = response.xpath('//ul[@class="tab_content"]/text()').extract()
    url = records
    metakey = "test"
    metavalue = "test"
    print(database.query())
    print(database.insert(url, metakey, metavalue))
The issue is actually solved:
from scrapy.http import TextResponse

b = ''.join(body1)
response = TextResponse(url="Any String", body=b, encoding='utf-8')
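An alternative sketch, in case it helps: Scrapy's Selector can parse an HTML string directly, so there is no need to build a Response object at all. This assumes database.query() returns (job_url, job_description) tuples as in the code above:
from scrapy.selector import Selector

records = database.query()
for job_url, job_description in records:
    # decode first if the column comes back as bytes
    sel = Selector(text=job_description)
    texts = sel.xpath('//ul[@class="tab_content"]//text()').extract()
    database.insert(job_url, "test", " ".join(texts))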

CouchDB change Database directory

I am trying to change the database directory of CouchDB. I am using a Python script to import a CSV file into CouchDB, and the script runs fine. Here it is, just in case:
from couchdbkit import Server, Database
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
import sys, subprocess, math, os

def parseDoc(doc):
    for k, v in doc.items():
        if isinstance(v, str):
            #print k, v, v.isdigit()
            # see if this string is really an int or a float
            if v.isdigit() == True:  # int
                doc[k] = int(v)
            else:  # try a float
                try:
                    if math.isnan(float(v)) == False:
                        doc[k] = float(v)
                except:
                    pass
    return doc

def upload(db, docs):
    db.bulk_save(docs)
    del docs
    return list()

def uploadFile(fname, dbname):
    # connect to the db
    theServer = Server()
    db = theServer.get_or_create_db(dbname)
    # loop over the file for upload
    reader = DictReader(open(fname, 'rU'), dialect='excel')
    docs = list()
    checkpoint = 100
    i = 0
    for doc in reader:
        newdoc = parseDoc(doc)
        docs.append(newdoc)
        if len(docs) % checkpoint == 0:
            docs = upload(db, docs)
            i += 1
            print('Number: %d' % i)
    # don't forget the last batch
    docs = upload(db, docs)

if __name__ == '__main__':
    x = '/media/volume1/Crimes_-_2001_to_present.csv'
    filename = x
    dbname = 'test'
    uploadFile(filename, dbname)
I have seen plenty of posts on how to change the directory the database is written to. If I leave /etc/couchdb/local.ini as it is (the original after installation), the script appends data to the default directory /var/lib/couchdb/1.0.1/. When I modify local.ini to store the database on another disk:
database_dir = /media/volume1
view_index_dir = /media/volume1
and then restart the CouchDB service, I get this error:
restkit.errors.RequestError: socket.error: [Errno 111] Connection refused
I have checked the open sockets (CouchDB uses 5984 by default) and the port is not open, yet I get no errors when I start the CouchDB service.
Any ideas how to fix this?
I think the error may be because you have changed the directory location in local.ini, but when CouchDB tries to open the existing database on a new connection it cannot find it there.
So move the database_name.couch file to the new location that you put in local.ini and then try to make a connection. I think this should work.
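On top of that, a check that is often useful here: if CouchDB reports no startup errors but nothing listens on 5984, a common cause is that the new database_dir is not owned or writable by the couchdb user. A small sketch to confirm both points (the path and port are the defaults from the question):
import os
import socket
import pwd

DB_DIR = '/media/volume1'   # database_dir from local.ini

# 1. Is anything listening on CouchDB's default port?
sock = socket.socket()
try:
    sock.connect(('127.0.0.1', 5984))
    print('CouchDB is listening on 5984')
except socket.error as exc:
    print('CouchDB is not reachable: %s' % exc)
finally:
    sock.close()

# 2. Who owns the new database directory? It should be the couchdb user,
#    i.e. the equivalent of: chown -R couchdb:couchdb /media/volume1
stat = os.stat(DB_DIR)
print('owner of %s is %s' % (DB_DIR, pwd.getpwuid(stat.st_uid).pw_name))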
