Extracting data into separate Excel sheets using Python - python-3.x

I want to write a Python script that executes multiple SQL queries and saves their output to Excel.
Suppose I have 4 SQL queries, i.e. Script1, Script2, Script3 & Script4, and I want to save the generated workbook as E:\Test. In that workbook, Sheet1 should contain Script1's output, Sheet2 Script2's output, Sheet3 Script3's output, and so on. I have written a script, but it works for only one query.
With this script I can generate an Excel sheet named Test, but how do I run the remaining queries so that their output shows up in the other sheets of the same workbook?
Please help.
import psycopg2
import sys
import pprint
import pandas as pd
import os
import openpyxl.cell

COMMASPACE = ', '

def main():
    conn_string = "dbname='abc' user='qwerty' host='pqr' password='******' port='1234'"
    script1 = """
    select * From something1
    """
    script2 = """
    select * From something2
    """
    script3 = """
    select * From something3
    """
    script4 = """
    select * From something4
    """
    pprint.pprint('Making connection to the Database...')
    con1 = psycopg2.connect(conn_string)
    cur = con1.cursor()
    pprint.pprint('Execution Start')
    cur.execute(script1)  # only the first query ever runs; script2-4 are unused
    if not cur.rowcount:
        pprint.pprint('Oops! Error Occured')
    else:
        columns = [desc[0] for desc in cur.description]
        data = cur.fetchall()
        df = pd.DataFrame(list(data), columns=columns)
        df.columns = map(str.upper, df.columns)
        writer = pd.ExcelWriter('E:\\Test.xlsx')
        df.to_excel(writer, sheet_name='Sheet1')

        def hide_column(ws, column_id):
            # helper to hide a worksheet column by letter or 1-based index (currently unused)
            if isinstance(column_id, int):
                assert column_id >= 1, "Column numbers must be 1 or greater"
                column_id = openpyxl.cell.get_column_letter(column_id)
            column_dimension = ws.column_dimensions[column_id]
            column_dimension.hidden = True

        writer.save()
        print("END of extraction")

if __name__ == "__main__":
    main()

Try using pandas read_sql with SQLAlchemy.
from openpyxl import load_workbook
from sqlalchemy import create_engine
import pandas as pd

# Parameters for SQLAlchemy
ServerName = "your_Server_Name"
Database = "Your_Database"
Driver = "Your_Driver"

# Create the connection
engine = create_engine('mssql+pyodbc://' + ServerName + '/' + Database + "?" + Driver)

# Read each query into its own dataframe
df1 = pd.read_sql_query("select * from somewhere", engine)
df2 = pd.read_sql_query("select * from somewhere_else", engine)

# Use openpyxl to write into an existing workbook
file = 'Your_file_path_Here'
book = load_workbook(file)
writer = pd.ExcelWriter(file, engine='openpyxl')
writer.book = book

# Now write each dataframe to its own sheet
df1.to_excel(writer, index=None, sheet_name='SQL1')
df2.to_excel(writer, index=None, sheet_name='SQL2')
writer.save()
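
If you want to stay with psycopg2 as in the question, the same multi-sheet idea works there too. A minimal sketch (reusing the connection string and queries from the question; pd.read_sql replaces the manual cursor/fetchall steps) that writes each query's result to its own sheet of one workbook:

import psycopg2
import pandas as pd

conn_string = "dbname='abc' user='qwerty' host='pqr' password='******' port='1234'"
scripts = {
    'Sheet1': "select * From something1",
    'Sheet2': "select * From something2",
    'Sheet3': "select * From something3",
    'Sheet4': "select * From something4",
}

con = psycopg2.connect(conn_string)
# a single ExcelWriter keeps the workbook open, so all sheets land in the same file
with pd.ExcelWriter('E:\\Test.xlsx') as writer:
    for sheet_name, query in scripts.items():
        df = pd.read_sql(query, con)
        df.columns = map(str.upper, df.columns)
        df.to_excel(writer, sheet_name=sheet_name, index=False)
con.close()

The key point is that to_excel adds a new sheet each time it is called against the same writer; opening a fresh ExcelWriter per query would overwrite the file each time.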

Related

Issues with multiprocessing and import getpass

Trying to write multiprocessing code using the getpass module:
import time
from multiprocessing import Pool
from multiprocessing import freeze_support
import getpass
import jaydebeapi
import pandas as pd
import numpy as np

pw = getpass.getpass(prompt="Password", stream=False)

# establishing connection to the ODS database
ODS = jaydebeapi.connect(
    'com.ibm.db2.jcc.DB2Driver',
    'jdbc:db2://he3qlxvtdbs351.fhlmc.com:50001/DB2QLTY',
    ['f408195', pw],
    'C:/JDBC/db2jcc.jar')

# Allows SQL statements between the ODS database
ODS = ODS.cursor()

# creating the password needed to establish PML database connection
pw_2 = getpass.getpass(prompt="Password", stream=False)

# establishing connection to the PML database
PML = jaydebeapi.connect(
    'com.ibm.db2.jcc.DB2Driver',
    'jdbc:db2://he3qlxvtdbs957.fhlmc.com:50001/PMLFDB2',
    ['f408195', pw_2],
    'C:/JDBC/db2jcc.jar')

# Allows SQL statements between the PML database
PML = PML.cursor()

def test(first_evnt, last_evnt):
    PML_loan_Query = "select b.id_lpa_alt_loan from udbadm.pml_lst_cmpltd_trans_mtch a join udbadm.lpa_altv_loan_idtn b on a.id_evnt = b.id_evnt where b.cd_lpa_alt_loan_idtn = 'HewlettPackardGeneratedTransaction' and a.id_evnt BETWEEN ? AND ?"
    PML.execute(PML_loan_Query, (first_evnt, last_evnt))
    loan_records = PML.fetchall()
    df = pd.DataFrame()
    for x in loan_records:
        # Populating the ODS table
        #borr_query = "SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(100)) AS cd_idx, CAST(rate_curr_int AS INT) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS INT) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS INT) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS INT) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        borr_query = 'SELECT nbr_aus, CAST(NULLIF(NULLIF(cd_idx, -9999), 0.000000) AS VARCHAR(10)) AS cd_idx, CAST(rate_curr_int AS VARCHAR(10)) AS rate_curr_int, CAST(NULLIF(rate_gr_mrtg_mrgn,0) AS VARCHAR(10)) AS rate_gr_mrtg_mrgn, CAST(rate_loln_max_cap AS VARCHAR(10)) AS rate_loln_max_cap, CAST(NULLIF(rate_perdc_cap,0) AS VARCHAR(10)) AS rate_perdc_cap FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus IN (?)'
        #borr_query = "SELECT DISTINCT nbr_aus FROM DB2MANT.I_LP_TRANS WHERE nbr_trans_aus BETWEEN ? AND ?"
        ODS.execute(borr_query, x)
        #ODS.execute(ODS_list)
        ODS_records = ODS.fetchall()
        ODS_records = df.append(pd.DataFrame(ODS_records, columns=['nbr_aus', 'cd_idx', 'rate_curr_int', 'rate_gr_mrtg_mrgn', 'rate_loln_max_cap', 'rate_perdc_cap']))
    return ODS_records

if __name__ == '__main__':
    freeze_support()
    first_evnt = 155643917
    last_evnt = 155684481
    p = Pool()
    result = p.map(test, [first_evnt, last_evnt])
    print(result)
    p.close()
    p.join()
I saved this script into a .py file and tried to run it from the command prompt. It asked for my ODS database password, then my PML database password, and then it seems to keep running the getpass prompt over and over again.
[Screenshot of the Python script's terminal output omitted.]
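
A note on what is likely happening, plus a minimal sketch (the worker function below is hypothetical): on Windows, multiprocessing starts workers with the "spawn" method, so every worker re-imports the script and re-executes all top-level code, including the getpass.getpass() calls. Keeping the prompts and connection setup under the if __name__ == '__main__': guard, or opening connections inside the worker, avoids the repeated prompts:

import getpass
from multiprocessing import Pool, freeze_support

def worker(evnt_range):
    # hypothetical worker: each process would open its own DB connections here,
    # instead of at module level where spawn re-executes them once per worker
    first_evnt, last_evnt = evnt_range
    return (first_evnt, last_evnt)

if __name__ == '__main__':
    freeze_support()
    pw = getpass.getpass(prompt="Password", stream=False)  # now prompted exactly once
    with Pool() as p:
        # map() calls the worker once per element, so the range is passed
        # as a single tuple rather than two separate list items
        print(p.map(worker, [(155643917, 155684481)]))

Note also that p.map(test, [first_evnt, last_evnt]) in the original calls test once per list element with a single argument each time, rather than calling test(first_evnt, last_evnt).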

Save scraping results one by one into Excel or CSV file in Python

I have the following crawler code:
import requests
import json
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
from datetime import datetime

def crawl(id):
    try:
        url = 'https://www.china0001.com.cn/project/{0:06d}.html'.format(id)
        print(url)
        content = requests.get(url).text
        soup = BeautifulSoup(content, 'lxml')
        tbody = soup.find("table", attrs={"id": "mse_new"}).find("tbody", attrs={"class": "jg"})
        tr = tbody.find_all("tr")
        rows = []
        for i in tr[1:]:
            rows.append([j.text.strip() for j in i.findAll("td")])
        out = dict([map(str.strip, y.split(':')) for x in rows for y in x])
        return out
    except AttributeError:
        return False

data = list()
for id in range(699998, 700010):
    print(id)
    res = crawl(id)
    if res:
        data.append(res)

if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_excel('test.xlsx', index=False)
In this code, the result dataframe df is written to an Excel file only after the whole scraping process has finished.
Now I want to save the scraping results one by one into an Excel or CSV file during the scraping process. How could I modify the code above?
Thanks.
Updates:
from concurrent import futures  # this import was missing from the snippet

MAX_WORKERS = 30
ids = range(700000, 700050)
workers = min(MAX_WORKERS, len(ids))
with futures.ThreadPoolExecutor(workers) as executor:
    res = executor.map(crawl, sorted(ids))
data = list(res)
if len(data) > 0:
    df = pd.DataFrame(data)
    df.to_csv('test.csv', mode='a', header=True, index=False)
Try using to_csv with header=False, index=False.
Ex:
for id in range(699998, 700010):
    res = crawl(id)
    if res:
        df = pd.DataFrame([res])
        df.to_csv('test.csv', mode='a', header=False, index=False)
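
One wrinkle with mode='a' is the header: you want it written once when the file is created and never again. A small sketch handling that (append_row is a hypothetical helper name):

import os
import pandas as pd

def append_row(res, path='test.csv'):
    # write the header only when the file does not exist yet,
    # so repeated appends keep the CSV well-formed
    df = pd.DataFrame([res])
    df.to_csv(path, mode='a', header=not os.path.exists(path), index=False)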
I’d recommend looking at my question on here:
What is the problem with the pandas to csv in my code?.
Look at the answers there for the daily sheets, then apply and modify that approach to fit your program.

Apply same process on multiple files in the same folder using Python

I need to read all the CSV files in a specific folder, apply a specific process (calculating some parameters) to each file, and for each file create an Excel file in which to store the results.
So far I have applied the calculation to each file manually, but I need to automate the process: the only input should be the folder's name, and every CSV file inside it should be treated as an input.
I have been advised to use pandas for this, but I couldn't figure out how.
My question is: is it even possible to do this with Python?
This is part of my code:
main.py
import time  # needed for the timing below
from Dlt2Excel_Fct import *
from ModePrvPblc_Fct import *
from FilePaths import filename_csv, filename_asc, filepath

start = time.time()
dlt2excel()
ModePrvPblcGps()
duree = time.time() - start
print('duree', duree)
Dlt2Excel_Fct.py
import pandas as pd
import xlsxwriter
import sys
import os
from tkinter import filedialog
from tkinter import *
from FilePaths import filename_csv

def dlt2excel():
    """Enter the directory of the exported csv file"""
    user_input = filename_csv
    # user_input = input("Enter the path of your file: ")
    assert os.path.exists(user_input), "I did not find the file at, " + str(user_input)
    f = open(user_input, 'r+')
    print("We found your file!")

    """Organize the exported file"""
    inputFile = f
    workbook = xlsxwriter.Workbook('output01.xlsx')
    worksheet = workbook.add_worksheet()
    exportFile = open('output01.xlsx', 'w')
    workbook.close()
    for line in inputFile:
        new_line = line.replace(',', '\t')
        exportFile.write(new_line)
    f.close()
    inputFile.close()
    exportFile.close()
    df = pd.read_table('output01.xlsx', error_bad_lines=False)  # for '\t'
    df.to_excel('output1.xlsx', 'Sheet1')

    """Count the number of duplicates"""
    data = pd.read_excel(r'output1.xlsx', header=0)
    data.count()
    data['count'] = data.groupby(['Payload'])['Index'].transform('count')
    data.to_excel('OutputDLT.xlsx', sheet_name='sheet1', index=False)
    print("Conversion is done!\n")
ModePrvPblc_Fct.py
import openpyxl
from openpyxl import Workbook
from openpyxl import load_workbook
#from ExcelName import filepath
from FilePaths import filepath

filename = filepath

def ModePrvPblcGps():
    file_name = 'OutputDLT.xlsx'
    wb = openpyxl.load_workbook(file_name, read_only=False)
    ws = wb.active
    sheet = wb['sheet1']
    ls = []
    PsgPrv = 0
    PsgPblc = 0
    for row in ws.iter_rows():
        for cell in row:
            #print('Cell: [{}] is type({}): "{}"'.format(cell.coordinate, type(cell.value).__name__, cell.value))
            if cell.value == 'SQLR: K<ATT_PRIVACY_MODE> V<1>':
                PsgPrv += 1
            if cell.value == 'SQLR: K<ATT_PRIVACY_MODE> V<0>':
                PsgPblc += 1
    print('Passage en mode public: ', PsgPblc)
    print('Passage en mode privé: ', PsgPrv)
    wb = load_workbook(filename)
    ws = wb.worksheets[0]
    parametres = (
        ['Passage en mode privé ', PsgPrv],
        ['Passage en mode public ', PsgPblc],
    )
    for row_ in parametres:
        ws.append(row_)
    wb.save(filename)
FilePaths.py
import tkinter as tk
from tkinter.simpledialog import askstring
from tkinter import filedialog
import os
import openpyxl
import warnings

warnings.filterwarnings("ignore")

root = tk.Tk()
folder_selected = filedialog.askdirectory()
print(folder_selected)
path = folder_selected + "/"
nom = askstring("Name", "Enter the name of the result file")
print(nom)
if nom is None:
    nom = str(None)
else:
    nom = nom + ".xlsx"
if not os.path.exists(path):
    os.makedirs(path)
filepath = path + nom
if not os.path.isfile(filepath):
    wb = openpyxl.Workbook(filepath)
    wb.save(filename=filepath)
root.file_name = filedialog.askopenfilename(initialdir="/", title="Select csv file", filetypes=(("csv files", "*.csv"), ("all files", "*.*")))
filename_csv = root.file_name
print(filename_csv)
root.file_name1 = filedialog.askopenfilename(initialdir="/", title="Select trace file", filetypes=(("asc files", "*.asc"), ("all files", "*.*")))
filename_asc = root.file_name1
print(filename_asc)
root.withdraw()
I have many folders that contain multiple CSV files, which is why I need to automate the process.
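
A minimal sketch of the automation loop (process is a hypothetical stand-in for the per-file parameter calculation): glob every CSV in the chosen folder and write one Excel result file per input:

import glob
import os
import pandas as pd

def process_folder(folder):
    for csv_path in glob.glob(os.path.join(folder, '*.csv')):
        df = pd.read_csv(csv_path)
        result = process(df)  # hypothetical: your per-file calculation
        out_path = os.path.splitext(csv_path)[0] + '_results.xlsx'
        result.to_excel(out_path, index=False)

So yes, it is possible: the folder name is the only input, and the loop replaces the manual per-file runs.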

How to create a table in SQLite3 by importing Excel data in Python?

In my code, I am importing data from an Excel file into an SQLite database using Python.
It doesn't give any error, but it turns every Excel column name into its own table.
I have multiple Excel files with the same data structure, each containing 40K rows and 52 columns.
When I import these files into the SQLite database with the Python code below, each column header name becomes a table.
import sqlite3
import pandas as pd

filename = gui_fname()
con = sqlite3.connect("cps.db")
wb = pd.read_excel(filename, sheet_name='Sheet2')
for sheet in wb:
    wb[sheet].to_sql(sheet, con, index=False, if_exists='append')
con.commit()
con.close()
It should create a table named after the sheet I am importing.
By trial and error I found a fix: I put con.commit() inside the for loop and it works as required, but I don't understand the logic.
I would appreciate it if anyone could explain this to me.
import sqlite3
import pandas as pd

filename = gui_fname()
con = sqlite3.connect("cps.db")
wb = pd.read_excel(filename, sheet_name='Sheet2')
for sheet in wb:
    wb[sheet].to_sql(sheet, con, index=False, if_exists='append')
    con.commit()
con.close()
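
For what it's worth, the table-per-column behaviour has a simple cause: with sheet_name='Sheet2', read_excel returns a single DataFrame, and iterating over a DataFrame yields its column names, so each wb[sheet] is one column being written as its own table. A minimal sketch of the two straightforward alternatives (the file name myfile.xlsx is hypothetical):

import sqlite3
import pandas as pd

con = sqlite3.connect("cps.db")

# one sheet -> one table: write the whole dataframe at once
df = pd.read_excel("myfile.xlsx", sheet_name='Sheet2')
df.to_sql('Sheet2', con, index=False, if_exists='append')

# several sheets -> sheet_name=None returns a {sheet name: dataframe} dict
sheets = pd.read_excel("myfile.xlsx", sheet_name=None)
for name, frame in sheets.items():
    frame.to_sql(name, con, index=False, if_exists='append')

con.commit()
con.close()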
import sqlite3  # this import was missing from the snippet
import pandas as pd

def import_excel_to_sqlite_db(excelFile):
    df = pd.read_excel(excelFile)
    con = sqlite3.connect("SQLite.db")
    cur = con.cursor()
    results = cur.execute("Select * from TableName")
    final = df.to_sql("TableName", con, if_exists="append", index=False)
    pd.DataFrame(results, columns=final)
    con.commit()
    cur.close()

Inserting values in a table using psycopg2

I am trying to insert data into a "Dummy" table in PostgreSQL using psycopg2 and the faker library. This is a table I created only for learning purposes. It has a single column, Student_name, of type char[]. Below is my Python script:
import psycopg2
from faker import Faker

fake = Faker()
conn = psycopg2.connect(database="kreiotdb", user="****", password="*****", host="127.0.0.1", port="5432")
print("Connected Successfuly")
cur = conn.cursor()
for i in range(10):
    name = fake.name()
    cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""", [name])
It gives me the following error when I run the script (the connection itself succeeds):
Fri Nov 02 12:16:07 gaurav ~ $ python3 /Users/gaurav/Desktop/populate.py
Connected Successfuly
Traceback (most recent call last):
  File "/Users/gaurav/Desktop/populate.py", line 11, in <module>
    cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""",[name])
psycopg2.DataError: malformed array literal: "Brent Allison"
LINE 1: INSERT INTO "Dummy" ("Student_name") VALUES ('Brent Allison...
                                                     ^
DETAIL:  Array value must start with "{" or dimension information.
Why is it giving me this error, and what should I do?
Please help.
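
The error comes from the column type: char[] is an array type in Postgres, so a plain Python string is rejected as a malformed array literal. A minimal sketch of the two usual fixes (pick one; the ALTER TABLE line is only needed if you actually want plain text):

# option 1: make the column plain text, then the original insert works as-is
# ALTER TABLE "Dummy" ALTER COLUMN "Student_name" TYPE varchar;
cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""", [name])

# option 2: keep char[] and pass a Python list, which psycopg2 adapts to an array
cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""", ([name],))

conn.commit()  # without a commit, the inserted rows are rolled back when the script exits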
import os
import csv
import sys
import psycopg2
import json
#import xlsxwriter
#import configparser
import psycopg2.extras
import psycopg2.extensions
#import logging
#import logging.config
import datetime
import zipfile
from subprocess import call

def db_connect():
    dbconn = None
    #if conf_section in config == False:
    #    print("Given section -> {0} is not exists in conf file.".format(conf_section))
    #    return None
    dbhost = ""
    dbport = ""
    dbname = ""
    dbuser = ""
    dbpass = ""
    try:
        dbconn = psycopg2.connect(host=dbhost, port=dbport, dbname=dbname, user=dbuser, password=dbpass)
        dbconn.autocommit = True
    except Exception as e:
        print(e)
        return None
    finally:
        return dbconn

def execute_query(dbconn, query):
    nrows = cursor = None
    colnames = result = []
    try:
        cursor = dbconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(query)
    except Exception as e:
        print(e)
        return (0, colnames, result)
    nrows = cursor.rowcount
    colnames = [desc[0] for desc in cursor.description]
    result = cursor.fetchall()
    #cursor.close()
    return (nrows)

def parse_csv(default_data):
    with open('key.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            tid = row['TID']
            mid = row['MID']
            key = row['Exported Key ']
            kcv = row['KCV']
            serial_no = row['HarwardSerialNo']
            print("TID=" + tid + " MID=" + mid + " EXPORTED KEY=" + key + " KCV=" + kcv)
            request_data = default_data + key
            request_data.replace(" ", "")
            print(request_data)
            cmd = "/home/siva/HSM_REQ/hsm_comms.out 192.168.5.51 4000" + request_data
            response_data = os.system(cmd)
            print(response_data)
            dbconn = db_connect()
            query = "select * from hsm_keys where serial_no ='" + serial_no + "'"
            rows = execute_query(dbconn, query)
            print(rows)
            if rows == 0:
                query = "INSERT "
                print(query)
    return ()

def main():
    header = "0101303200"
    head_len = "1D"
    fun_code = "EE0200"
    fun_mod = "00"
    key_len = "05"
    key_spc = "081002"
    key_index = "0004"
    key_type = "0500"
    len_of_key = "10"
    default_data = header + head_len + fun_code + fun_mod + key_len + key_spc + key_index + key_type + len_of_key
    print(default_data)
    parse_csv(default_data)

if __name__ == '__main__':
    main()
