Problem reading SQL query in Python 3 with pandas

Good morning.
I'm trying to read a SQL query with pandas through an SSH tunnel. It worked fine in Python 2.7, but now, with Python 3.7, the process seems to hang when executing pd.read_sql_query. My code is the following:
def conect(lista, names):
    # Block of code where I set the variables used below.
    while not success and retries < max_retries:
        try:
            print('Trying to connect ({n})...'.format(n=retries + 1))
            with SSHTunnelForwarder((REMOTE_SERVER, 22),
                                    ssh_username=user_name,
                                    ssh_pkey=ssh_pkey,
                                    ssh_private_key_password=password,
                                    remote_bind_address=(str_number, PUERTO),
                                    local_bind_address=('', PUERTO)) as tunnel:
                engine = sqlalchemy.create_engine("postgresql+psycopg2://{user}:{passw}@{host}:{port}/{db}".format(
                    user=user_name,
                    passw=long_pass,
                    host=tunnel.local_bind_host,
                    port=tunnel.local_bind_port,
                    db=db))
                dic_df = {name: pd.read_sql_query(query, engine) for query, name in zip(lista, names)}
                return dic_df
        except Exception as e:
            print('Error...')
            print(e)
            retries += 1
I don't know if the problem comes from the engine or from pd.read_sql_query itself...
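To narrow it down, a sketch of a check I could add inside the tunnel block (using only names already defined above): run a trivial query on the engine first, so I can see whether the hang happens in the connection itself or in pd.read_sql_query:

    # Run inside the `with SSHTunnelForwarder(...)` block, right after
    # creating `engine`, to locate where the hang occurs.
    with engine.connect() as conn:
        print(conn.execute(sqlalchemy.text("SELECT 1")).scalar())  # hangs here -> tunnel/engine problem
    df = pd.read_sql_query(lista[0], engine)  # hangs here -> pandas/query side
    print(df.shape)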
Many thanks in advance!!

Related

Downloading large files (10GB+) fails with Python Athena (Token Expired)

I am still learning Python (3.6) and am now working on AWS. I am trying to automate a process wherein a user runs a query in Athena. The results of the query are directed to an S3 bucket. From S3, I need to pull the file to my local machine and then run some more analysis using legacy tools. All this is currently done step by step manually, starting by firing a query in the Athena Query Editor.
The problem I am facing is that the file(s) will be larger than 10GB and the SAML profile token expires after 1 hour. I have read some documentation about auto-refreshing the credentials; however, how would I even implement a solution like that while the file is being downloaded? I have put my code below (that's the closest I got to a successful run, with about 10000 records).
Any suggestions/help is appreciated.
import boto3
from boto3.s3.transfer import TransferConfig
import pandas as pd
import time

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 100)
pd.set_option('display.width', 1000)

session = boto3.Session(profile_name='saml')
athena_client = session.client("athena")
query_response = athena_client.start_query_execution(
    QueryString="SELECT * FROM TABLENAME WHERE=<condition>",
    QueryExecutionContext={"Database": 'some_db'},
    ResultConfiguration={
        "OutputLocation": 's3://131653427868-heor-epi-workbench-results',
        "EncryptionConfiguration": {"EncryptionOption": "SSE_S3"},
    },
    WorkGroup='myworkgroup'
)
print(query_response)

iteration = 30
temp_file_location: str = "C:\\Users\\<user>\\Desktop\\Python Projects\\tablename.csv"
while iteration > 0:
    iteration = iteration - 1
    print(iteration)
    query_response_id = athena_client.get_query_execution(QueryExecutionId=query_response['QueryExecutionId'])
    print(query_response_id)
    if (query_response_id['QueryExecution']['Status']['State'] == 'FAILED') or (query_response_id['QueryExecution']['Status']['State'] == 'CANCELLED'):
        print("IF BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print("The Query Failed.")
    elif query_response_id['QueryExecution']['Status']['State'] == 'SUCCEEDED':
        print("ELSE IF BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print("Query Completed. Ready to download.")
        print("Proceeding to Download File......")
        config = TransferConfig(max_concurrency=5)
        s3_client = session.client("s3")
        s3_client.download_file('131653427868-heor-epi-workbench-results',
                                f"{query_response['QueryExecutionId']}.csv",
                                temp_file_location,
                                Config=config)
        print("Download complete. Setting Iteration to 0 to exit loop. ")
        iteration = 0
    else:
        print("ELSE BLOCK: ", query_response_id['QueryExecution']['Status']['State'])
        print(query_response_id['QueryExecution']['Status']['State'])
    time.sleep(10)

pandasDF = pd.read_csv(temp_file_location)
print(pandasDF)
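One pattern that might address the expiry (a sketch under assumptions, not a tested solution): botocore's RefreshableCredentials calls a refresh function whenever the token nears expiry, so a boto3 session built on it keeps valid credentials through a multi-hour download. The refresh helper below is a hypothetical stand-in for whatever re-runs the SAML login:

    import boto3
    import botocore.session
    from botocore.credentials import RefreshableCredentials

    def refresh_saml_credentials():
        # Hypothetical stand-in: re-authenticate the way the 'saml' profile
        # does (e.g. assume_role_with_saml) and return the fresh keys.
        c = get_fresh_credentials()  # hypothetical helper returning an STS Credentials dict
        return {
            "access_key": c["AccessKeyId"],
            "secret_key": c["SecretAccessKey"],
            "token": c["SessionToken"],
            "expiry_time": c["Expiration"].isoformat(),
        }

    creds = RefreshableCredentials.create_from_metadata(
        metadata=refresh_saml_credentials(),
        refresh_using=refresh_saml_credentials,
        method="sts-assume-role-with-saml",
    )
    core_session = botocore.session.get_session()
    core_session._credentials = creds  # private attribute; a common workaround
    session = boto3.Session(botocore_session=core_session)
    # session.client("s3").download_file(...) now refreshes credentials mid-transfer.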

LDAP Query in Python3

I have an LDAP query which runs perfectly fine in my terminal:
ldapsearch -h ldap.mygreatcompany.com -D user@mygreatcompany.COM -w "$ldappassword" -b "DC=abc,DC=mygreatcompany,DC=com" -s sub "(mail=user1@mygreatcompany.COM)" sAMAccountName
I want to run this command in Python 3. I followed other answers from Stack Overflow and wrote something like this:
import ldap

l = ldap.initialize('ldap://ldap.mygreatcompany.com')
binddn = "user@mygreatcompany.COM"
pw = "$ldappassword"  # the same password passed with -w above
basedn = "DC=abc,DC=mygreatcompany,DC=com"
searchAttribute = ["sAMAccountName"]
searchFilter = "(&(mail=user1@mygreatcompany.COM)(objectClass=*))"
searchScope = ldap.SCOPE_SUBTREE

l.simple_bind_s(binddn, pw)
ldap_result_id = l.search_s(basedn, searchScope, searchFilter, searchAttribute)
# Get result
l.unbind_s()
But here I am not getting any result back in ldap_result_id. Can anybody tell me the correct way to do this query?
Thanks
It turns out that I was not using connection.set_option(ldap.OPT_REFERRALS, 0) in the code; without it, LDAP automatically chases referrals internally with anonymous access, which fails.
Here is the working code:
def get_user_id(email):
    # Search filter for user mail
    searchFilter = "mail={}".format(email)
    saMAccount = None
    try:
        # Bind to the LDAP server
        connection = ldap.initialize('ldap://yourcompanyhost')
        connection.set_option(ldap.OPT_REFERRALS, 0)
        connection.protocol_version = ldap.VERSION3
        connection.simple_bind_s(binddn, pwd)
        # Get result
        ldap_result_id = connection.search_s(basedn, searchScope, searchFilter, searchAttribute)
        # Extract the id
        saMAccount = ldap_result_id[0][1]["sAMAccountName"][0].decode('utf-8')
    except ldap.INVALID_CREDENTIALS:
        print("Your username or password is incorrect.")
    except ldap.LDAPError as e:
        print(e)
    except (IndexError, KeyError):
        print("Data doesn't exist for this user")
    connection.unbind_s()
    return saMAccount
print(get_user_id("user1@mygreatcompany.COM"))

Python3 pika channel.basic_consume() causing MySQL too many connections

I am using pika to connect to RabbitMQ and consume messages. Once I start the script on the Ubuntu prod environment it works as expected, but it keeps opening MySQL connections and never closes them, ending up with "Too many connections" on the MySQL server.
I would appreciate any recommendation on the code below, as I cannot understand what is going wrong. Thanking you in advance.
The flow is the following:
Start pika on Python 3
Subscribe to a channel and wait for messages
In the callback, do various validations and save or update data inside MySQL
The screenshot from Ubuntu htop at the end of the question shows the problem: new MySQL connections keep being added at the top of the list and are never closed
Pika version = 0.13.0
For MySQL I use pymysql.
Pika Script
def main():
    credentials = pika.PlainCredentials(tunnel['queue']['name'], tunnel['queue']['password'])
    while True:
        try:
            cp = pika.ConnectionParameters(
                host=tunnel['queue']['host'],
                port=tunnel['queue']['port'],
                credentials=credentials,
                ssl=tunnel['queue']['ssl'],
                heartbeat=600,
                blocked_connection_timeout=300
            )
            connection = pika.BlockingConnection(cp)
            channel = connection.channel()

            def callback(ch, method, properties, body):
                if 'messageType' in properties.headers:
                    message_type = properties.headers['messageType']
                    if events.get(message_type):
                        result = Descriptors._reflection.ParseMessage(events[message_type]['decode'], body)
                        if result:
                            result = protobuf_to_dict(result)
                            model.write_response(external_response=result, message_type=message_type)
                    else:
                        app_log.warning('Message type not in allowed list = ' + str(message_type))
                        app_log.warning('continue listening...')

            channel.basic_consume(callback, queue=tunnel['queue']['name'], no_ack=True)
            try:
                channel.start_consuming()
            except KeyboardInterrupt:
                channel.stop_consuming()
                connection.close()
                break
        except pika.connection.exceptions.ConnectionClosed as e:
            app_log.error('ConnectionClosed :: %s' % str(e))
            continue
        except pika.connection.exceptions.AMQPChannelError as e:
            app_log.error('AMQPChannelError :: %s' % str(e))
            continue
        except Exception as e:
            app_log.error('Connection was closed, retrying... %s' % str(e))
            continue

if __name__ == '__main__':
    main()
Inside the script I have a model that does inserts or updates in the database; code below.
def write_response(self, external_response, message_type):
    table_name = events[message_type]['table_name']
    original_response = external_response[events[message_type]['response']]
    if isinstance(original_response, list):
        external_response = []
        for o in original_response:
            record = self.map_keys(o, message_type, events[message_type].get('values_fix', {}))
            external_response.append(self.validate_fields(record))
    else:
        external_response = self.map_keys(original_response, message_type, events[message_type].get('values_fix', {}))
        external_response = self.validate_fields(external_response)
    if not self.mysql.open:
        self.mysql.ping(reconnect=True)
    with self.mysql.cursor() as cursor:
        if isinstance(original_response, list):
            for e in external_response:
                id_name = events[message_type]['id_name']
                filters = {id_name: e[id_name]}
                self.event(
                    cursor=cursor,
                    table_name=table_name,
                    filters=filters,
                    external_response=e,
                    message_type=message_type,
                    event_id=e[id_name],
                    original_response=e  # not required here
                )
        else:
            id_name = events[message_type]['id_name']
            filters = {id_name: external_response[id_name]}
            self.event(
                cursor=cursor,
                table_name=table_name,
                filters=filters,
                external_response=external_response,
                message_type=message_type,
                event_id=external_response[id_name],
                original_response=original_response
            )
        cursor.close()
    self.mysql.close()
    return
On Ubuntu I use systemd to run the script and restart it in case something goes wrong; below is the systemd unit file.
[Unit]
Description=Pika Script
Requires=stunnel4.service
Requires=mysql.service
Requires=mongod.service
[Service]
User=user
Group=group
WorkingDirectory=/home/pika_script
ExecStart=/home/user/venv/bin/python pika_script.py
Restart=always
[Install]
WantedBy=multi-user.target
[Image from Ubuntu htop showing how the MySQL connections keep being added to the list and are never closed]
Error
tornado_mysql.err.OperationalError: (1040, 'Too many connections')
I have found the issue; posting it in case it helps somebody else.
The problem was that mysqld went into an infinite loop trying to create indexes for a specific database; once I found which database it was, I saw it kept trying to create the indexes, never succeeded, and tried again and again.
The solution was to remove the database and recreate it; the mysqld process went back to normal, and the infinite index-creation loop disappeared as well.
I would say increasing connections may solve your problem temporarily.
1st, find out why the application is not closing the connection after completing its task (a minimal connection-per-task pattern is sketched below this list).
2nd, check for any slow queries/calls on the DB and fix them if there are any.
3rd, if there are no slow queries/calls on the DB and the application really does close the connection/thread immediately after completing its task, then consider playing with "wait_timeout" on the MySQL side.
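A minimal sketch of the first point (my illustration, not the poster's code): open a pymysql connection per write and guarantee it is closed, so no connection can outlive its task. Host, user, and database names are placeholders:

    from contextlib import closing

    import pymysql

    def write_row(sql, params):
        # closing() guarantees conn.close() runs even if execute() raises,
        # so each task leaves zero connections behind.
        with closing(pymysql.connect(host='localhost', user='app',
                                     password='secret', database='appdb')) as conn:
            with conn.cursor() as cursor:
                cursor.execute(sql, params)
            conn.commit()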
According to this answer, if you have MySQL 5.7 or 5.8:
It is worth knowing that if you run out of usable disc space on your server partition or drive, this will also cause MySQL to return this error. If you're sure it's not the actual number of users connected, then the next step is to check that you have free space on your MySQL server drive/partition.
From the same thread: you can inspect and increase the number of MySQL connections; a quick way to check is sketched below.
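An inspection sketch (mine, using pymysql since the poster already uses it; credentials are placeholders) comparing the current connection count to the configured limit:

    import pymysql

    conn = pymysql.connect(host='localhost', user='root', password='secret')
    with conn.cursor() as cur:
        # How many clients are connected right now
        cur.execute("SHOW STATUS LIKE 'Threads_connected'")
        print(cur.fetchone())
        # The configured ceiling that triggers error 1040 when exceeded
        cur.execute("SHOW VARIABLES LIKE 'max_connections'")
        print(cur.fetchone())
    conn.close()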

Unable to insert into MySQL database using Python, variables and auto_increment

I hope my formatting is OK, as this is my first time using Stack Overflow.
No matter how I change my code and methods, I keep getting the same bug when executing this code:
File "/usr/lib/python3/dist-packages/mysql/connector/cursor.py",
line 83, in call
return bytes(self.params[index]) IndexError: tuple index out of range
During handling of the above exception, another exception occurred:
Traceback (most recent call last): File "sqlTest.py", line 40, in
mycursor.execute(sql,val) File "/usr/lib/python3/dist-packages/mysql/connector/cursor.py", line 558,
in execute
stmt = RE_PY_PARAM.sub(psub, stmt) File "/usr/lib/python3/dist-packages/mysql/connector/cursor.py", line 86,
in call "Not enough parameters for the SQL statement")
mysql.connector.errors.ProgrammingError: Not enough parameters for the
SQL statement
This is a section of my main project that logs the current values of certain variables as well as the GPS coordinates and a timestamp.
From what I've seen, the main issue has to do with the database expecting 8 entries when I should only need 7.
I mainly followed the https://www.w3schools.com/python/python_mysql_insert.asp tutorial, as I am not super familiar with using Python and MySQL together.
# Initialize MySQL database connection
mydb = mysql.connector.connect(
    host="themySQLserver.net",
    user="myUSername",
    passwd="_____",
    database="24totheTLE"
)
These variables are normally set by the main program, but I set them manually for troubleshooting:
top_since_epoch = 4
left_since_epoch = 1
bottom_since_epoch = 5
right_since_epoch = 3
This is the code that calls the Python 2 script to get the GPS data:
fullgps = os.popen("python gps.py").read()
gps_split = fullgps.split(";")
gps_split[1] = gps_split[1].rstrip()
s1 = float(gps_split[0])
s2 = float(gps_split[1])
The primary key "LogNum" for my database is set to auto-increment, and as such I have not mentioned it in my code.
ts = time.time()
timestam = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
mycursor = mydb.cursor()
sql = "INSERT INTO records (TimeStamp,CarsTop,CarsLeft,CarsRight,CarsBottom,GPSLong,GPSLat) VALUES (%s, %s, %s, %s, %s, %s, %s)"
val = [timestam,top_since_epoch,left_since_epoch,bottom_since_epoch,s2,s1]
mycursor.execute(sql,val)
mydb.commit()
print(mycursor.rowcount, "record inserted.")
Thanks to anyone who replies.
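One thing worth noting (my observation from comparing the two snippets, not part of the accepted answer): the INSERT lists seven %s placeholders, but val above contains only six items; right_since_epoch is missing, which matches the "Not enough parameters" error. A minimal fix sketch:

    # Seven placeholders need seven values; right_since_epoch was missing.
    val = [timestam, top_since_epoch, left_since_epoch, right_since_epoch,
           bottom_since_epoch, s2, s1]
    mycursor.execute(sql, val)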
Finally found the answer.
The problem was that the code I used to pass the date-time information was not set up to interact with mysql.connector. This was solved by updating the code to work with the MySQLdb libraries.
This code worked for me:
# Initialize MySQL database connection
mydb = MySQLdb.connect(
    host="sql24.cpt1.host-h.net",
    user="stonesp_1",
    passwd="______",
    database="24totheTLE"
)
print("5 minutes")
fullgps = os.popen("python gps.py").read()
gps_split = fullgps.split(";")
s1 = float(gps_split[0])
s2 = float(gps_split[1])
print(s1)
print(s2)
ts = time.time()
timecur = datetime.datetime.fromtimestamp(ts).strftime('%Y-%m-%d %H:%M:%S')
print(timecur)
mycursor = mydb.cursor()
sql = "INSERT INTO records (TimeStamp,CarsTop,CarsLeft,CarsRight,CarsBottom,GPSLong,GPSLat) VALUES (%s,%s,%s,%s,%s,%s,%s)"
val = [timecur,top_since_epoch,left_since_epoch,right_since_epoch,bottom_since_epoch,s2,s1]
mycursor.execute(sql,val)
mydb.commit()
print(mycursor.rowcount, "record inserted.")
mydb.close()

CouchDB change Database directory

I am trying to change the directory of the CouchDB database. I am using a Python script to import a CSV file into CouchDB. The script runs OK. Here it is, just in case:
from couchdbkit import Server, Database
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
import sys, subprocess, math, os

def parseDoc(doc):
    for k, v in doc.items():
        if isinstance(v, str):
            # print k, v, v.isdigit()
            # See if this string is really an int or a float
            if v.isdigit() == True:  # int
                doc[k] = int(v)
            else:  # try a float
                try:
                    if math.isnan(float(v)) == False:
                        doc[k] = float(v)
                except:
                    pass
    return doc

def upload(db, docs):
    db.bulk_save(docs)
    del docs
    return list()

def uploadFile(fname, dbname):
    # Connect to the db
    theServer = Server()
    db = theServer.get_or_create_db(dbname)
    # Loop on file for upload
    reader = DictReader(open(fname, 'rU'), dialect='excel')
    docs = list()
    checkpoint = 100
    i = 0
    for doc in reader:
        newdoc = parseDoc(doc)
        docs.append(newdoc)
        if len(docs) % checkpoint == 0:
            docs = upload(db, docs)
            i += 1
            print 'Number : %d' % i
    # Don't forget the last batch
    docs = upload(db, docs)

if __name__ == '__main__':
    x = '/media/volume1/Crimes_-_2001_to_present.csv'
    filename = x
    dbname = 'test'
    uploadFile(filename, dbname)
I have seen plenty of posts on how to change the directory where the database is stored. If I leave /etc/couchdb/local.ini as it is (the original after installation), the script appends data to the default directory /var/lib/couchdb/1.0.1/. When I modify local.ini to store the database on another disk:
database_dir = /media/volume1
view_index_dir = /media/volume1
and restart the CouchDB service, I get this error:
restkit.errors.RequestError: socket.error: [Errno 111] Connection refused
I have checked the open sockets (CouchDB uses port 5984 by default) and it is not open. But I get no errors when I start the CouchDB service.
Any ideas how to fix it?
I think the error may be because you have changed the directory location in local.ini, but when you try to make a new connection to the existing database, CouchDB cannot find it there.
So move the database_name.couch file to the new location you put in local.ini and then try to make a connection. I think this should work.
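A minimal sketch of that move (assuming the default data path from the question and the 'test' database the script creates; adjust names to your setup). CouchDB also needs read/write access to the new location, so the couchdb user should own the moved file:

    import shutil

    # Move the database file from the default data directory to the new
    # database_dir configured in local.ini, then restart CouchDB.
    shutil.move('/var/lib/couchdb/1.0.1/test.couch', '/media/volume1/test.couch')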
