How to encode (utf-8) in Pandas (Excel as source) - python-3.x

I am trying to read from Excel and load into MongoDB using PyMongo.
The error I get is "cannot encode object: , of type: <class 'pandas._libs.missing.NAType'>". When I researched it, I was told to use the utf-8-sig format to insert into MongoDB, but in a pandas DataFrame there is no option to use utf-8.
from pymongo import MongoClient
from datetime import datetime
import pandas as pd
import Parameters
import pandasql as pf
import json
import pymongo
import xlrd
from pathlib import Path
import os
import constants

try:
    class conn:
        def __init__(self):
            client = pymongo.MongoClient("mongodb://" + constants.USER_NAME + ":" + constants.PWD + constants.server + constants.CA_CERTIFICATES_PATH)
            db = client[Parameters.STG_QC_Hub_Files]
            week = "08-02-2021"
            out_col = db[Parameters.col]
            filename = "1.xlsx"
            path1 = Path('//test3' + '/' + filename)
            data_load_date = datetime.today().strftime('%m-%d-%Y')
            df1 = pd.read_excel(path1, sheet_name="AU-ARCM Details", keep_default_na=False)
            # df1 = pd.read_excel(xls+filename, keep_default_na=False, encoding='utf-8-sig')
            # df1 = pd.read_csv(xls, keep_default_na=False, encoding='utf-8-sig').iloc[:, : 86]
            df1["Week"] = week
            df1["Data Load Date"] = data_load_date
            df1 = df1.astype('string')
            # df1.index = df1.index.str.encode('utf-8')
            df1 = df1.drop(['Source.Name'], axis=1)
            records = json.loads(df1.T.to_json()).values()
            out_col.insert_many(df1.to_dict('records'))
            print("Imported File " + str(filename) + " with " + str(len(records)) + " records")

    c = conn()
except Exception as e:
    print(e)
Traceback:
File "C:\Users\PycharmProjects\ReMs\venv\lib\site-packages\pymongo\message.py", line 1323, in _do_batched_op_msg
operation, command, docs, check_keys, ack, opts, ctx)
bson.errors.InvalidDocument: cannot encode object: <NA>, of type: <class 'pandas._libs.missing.NAType'>

You have some blank cells in your spreadsheet; pandas represents them with its own missing-value type (NAType, i.e. pd.NA), and pymongo doesn't know how to encode that type, hence the error. You will need to replace these values before loading the data into MongoDB with the method you are using.
Consider something like this just before you attempt the insert:
import numpy as np
df1 = df1.replace(np.nan, None)
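If the frame has already been cast to the nullable string dtype (as in the question), the missing values are pd.NA rather than np.nan. A minimal sketch of clearing both kinds of missing value just before the insert, assuming df1 and out_col are defined as in the question:

import pandas as pd

# Cast to plain Python objects and turn every missing value (np.nan or pd.NA)
# into None, which pymongo can encode as a BSON null.
df1 = df1.astype(object).where(pd.notna(df1), None)
out_col.insert_many(df1.to_dict('records'))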

Related

How to load data from a connection string with vaex package?

If I have a table on my server and I am producing a connection string to it, how can I load it into a dataframe using Vaex?
Here is what I am doing but with Pandas:
from sqlalchemy import types, create_engine, text
import pandas as pd
import pymysql

def connect_to_data(driver='mysql+pymysql://', conn_string=''):
    try:
        conn = create_engine(driver + conn_string)
        print("MySQL Connection Successful!")
    except Exception as err:
        print("MySQL Connection Failed!")
        print(err)
    return conn

# Connect to the db:
conn_string = 'xxxxxxxx'
conn = connect_to_data(conn_string=conn_string)

# Get all requests from the db:
query = '''SELECT * FROM table_name'''
result = conn.execute(text(query))

# Desired dataframe:
df = pd.read_sql_query(query, conn)
How can I do the same with Vaex (because of its high performance)?
For now at least, you can't do it directly. But vaex can easily read a pandas dataframe, so you can do:
import vaex

# Following your example..
pandas_df = pd.read_sql_query(query, conn)
df = vaex.from_pandas(pandas_df)
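One caveat worth checking against the vaex docs (not part of the original answer): from_pandas keeps the copied data in memory, so for large tables it can help to export once to an HDF5 file and reopen it memory-mapped; a rough sketch with a hypothetical file name:

import vaex

# Convert once, persist to HDF5, then later runs can open the file
# memory-mapped instead of re-querying the database.
df = vaex.from_pandas(pandas_df)
df.export_hdf5('table_name.hdf5')   # 'table_name.hdf5' is just an example path
df = vaex.open('table_name.hdf5')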

extract multiple URLs using the datetime function

In this program I am not using requests or BeautifulSoup; I'm only using datetime to build the URLs. Currently the program extracts values over a long period. I want to change it so that, if I automate the program and it runs today, it extracts yesterday's data; similarly, if it runs tomorrow, it extracts today's data, and so on.
Here is the code:
from datetime import date, datetime, timedelta
import warnings
import pandas as pd
import wget
import glob
import os
import ssl
from urllib.error import HTTPError

warnings.filterwarnings("ignore")
ssl._create_default_https_context = ssl._create_unverified_context

def date_range(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + timedelta(n)

def get_urls(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = date(2020, 11, 1)
    end_date = datetime.now().date()
    start_urls = list()
    for single_date in date_range(start_date, end_date):
        start_urls.append(single_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part))
    return start_urls

def excel_download(link, out):
    # downloads a given link to the output directory in out
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_links = get_urls(base_url)
    for link in file_links:
        try:
            excel_download(link, temp_folder)
        except HTTPError:
            content = "HTTP issue while capturing data for this link - " + link
            log_writer(log_file, content)  # log_writer is defined elsewhere in the asker's project
            continue
        file = glob.glob(os.path.join(temp_folder, '*.xlsx'), recursive=True)[0]
        df = pd.read_excel(file)
To capture yesterday's data, I added this check to the main function, where I compare against Yesterday and skip the file if it doesn't match. But it throws an error because it always picks the start date as day one.
if (date_time_obj != Yesterday):
    os.remove(file)
    content = "Date mis-matched - " + str(date_time_obj) + " " + str(Yesterday)
In this program, date_time_obj is the date it is currently trying to extract data for.
If this program runs every day at 8 pm, it only needs to capture the previous day's data.
If this cannot be done with datetime, but only with requests or bs4, how do I approach this problem?
I don't know if you wanted a valid link, as your code doesn't seem to produce one for me, but you only need to tweak it to work off start_date alone and return a single item: yesterday's link, matching your current output for the same date.
from datetime import datetime, timedelta
import warnings
import pandas as pd
import wget
import glob
import os
import ssl
from urllib.error import HTTPError

warnings.filterwarnings("ignore")
ssl._create_default_https_context = ssl._create_unverified_context

def get_url(base_url):
    part_two = "/dailyCoal1-"
    end_part = ".xlsx"
    start_date = datetime.now().date() + timedelta(-1)
    start_url = start_date.strftime(base_url + '%d-%m-%Y' + part_two + '%Y-%m-%d' + end_part)
    return start_url

def excel_download(link, out):
    # downloads a given link to the output directory in out
    wget.download(link, out)

if __name__ == "__main__":
    base_url = "https://npp.gov.in/public-reports/cea/daily/fuel/"
    mypath = "/Users/vp/Desktop/temp"
    temp_folder = '/Users/vp/Desktop/temp'
    out_folder = "/Users/vp/Desktop/NPP"
    log_file = os.path.join(out_folder, 'debug_log_npp.log')
    out_file = os.path.join(out_folder, 'Energy_inputs_npp.csv')
    file_link = get_url(base_url)
    print(file_link)
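To actually fetch and read yesterday's file with this single link, the question's download/read steps can be reused; a minimal sketch, assuming the folders from the question exist and the generated link is valid:

try:
    # Download yesterday's workbook and load the first .xlsx found in the temp folder.
    excel_download(file_link, temp_folder)
    file = glob.glob(os.path.join(temp_folder, '*.xlsx'), recursive=True)[0]
    df = pd.read_excel(file)
    print(df.head())
except HTTPError:
    print("HTTP issue while capturing data for this link - " + file_link)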

Inserting values in a table using psycopg2

I am trying to insert data into a "Dummy" table in PostgreSQL using psycopg2 and the faker library. This is a table that I have created only for learning purposes. It has only one column, Student_name, which is of type char[]. Below is my Python script:
import psycopg2
from faker import Faker

fake = Faker()
conn = psycopg2.connect(database="kreiotdb", user="****", password="*****", host="127.0.0.1", port="5432")
print("Connected Successfuly")
cur = conn.cursor()
for i in range(10):
    name = fake.name()
    cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""", [name])
It is giving me the following error when I run the script. The connection is successful
Fri Nov 02 12:16:07 gaurav ~ $ python3 /Users/gaurav/Desktop/populate.py
Connected Successfuly
Traceback (most recent call last):
File "/Users/gaurav/Desktop/populate.py", line 11, in <module>
cur.execute(""" INSERT INTO "Dummy" ("Student_name") VALUES (%s);""",[name])
psycopg2.DataError: malformed array literal: "Brent Allison"
LINE 1: INSERT INTO "Dummy" ("Student_name") VALUES ('Brent Allison...
^
DETAIL: Array value must start with "{" or dimension information.
Why is it giving me this error and what should I do?
Please help.
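The error message is Postgres complaining that a plain string is not a valid array literal, which suggests the Student_name column was declared as char[] (an array of char) rather than a character type. A minimal sketch of one way around it, assuming you actually want one text value per row (table and column names taken from the question, and the table is safe to recreate since it is only for practice):

import psycopg2
from faker import Faker

fake = Faker()
conn = psycopg2.connect(database="kreiotdb", user="****", password="*****",
                        host="127.0.0.1", port="5432")
cur = conn.cursor()

# Recreate the practice table with a plain text column instead of the array type char[].
cur.execute('DROP TABLE IF EXISTS "Dummy";')
cur.execute('CREATE TABLE "Dummy" ("Student_name" text);')

for _ in range(10):
    cur.execute('INSERT INTO "Dummy" ("Student_name") VALUES (%s);', [fake.name()])

conn.commit()   # without a commit the inserted rows are not persisted
cur.close()
conn.close()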
import os
import csv
import sys
import psycopg2
import json
#import xlsxwriter
#import configparser
import psycopg2.extras
import psycopg2.extensions
#import logging
#import logging.config
import datetime
import zipfile
from subprocess import call

def db_connect():
    dbconn = None
    #if conf_section in config == False:
    #    print("Given section -> {0} is not exists in conf file.".format(conf_section))
    #    return None
    dbhost = ""
    dbport = ""
    dbname = ""
    dbuser = ""
    dbpass = ""
    try:
        dbconn = psycopg2.connect(host=dbhost, port=dbport, dbname=dbname, user=dbuser, password=dbpass)
        dbconn.autocommit = True
    except Exception as e:
        print(e)
        return None
    finally:
        return dbconn

def execute_query(dbconn, query):
    nrows = cursor = None
    colnames = result = []
    try:
        cursor = dbconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
        cursor.execute(query)
    except Exception as e:
        print(e)
        return (0, colnames, result)
    nrows = cursor.rowcount
    colnames = [desc[0] for desc in cursor.description]
    result = cursor.fetchall()
    #cursor.close()
    return (nrows)

def parse_csv(default_data):
    with open('key.csv') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            tid = row['TID']
            mid = row['MID']
            key = row['Exported Key ']
            kcv = row['KCV']
            serial_no = row['HarwardSerialNo']
            print("TID=" + tid + " MID=" + mid + " EXPORTED KEY=" + key + " KCV=" + kcv)
            request_data = default_data + key
            request_data.replace(" ", "")
            print(request_data)
            cmd = "/home/siva/HSM_REQ/hsm_comms.out 192.168.5.51 4000" + request_data
            response_data = os.system(cmd)
            print(response_data)
            dbconn = db_connect()
            query = "select * from hsm_keys where serial_no ='" + serial_no + "'"
            rows = execute_query(dbconn, query)
            print(rows)
            if (rows == 0):
                query = "INSERT "
                print(query)
    return()

def main():
    header = "0101303200"
    head_len = "1D"
    fun_code = "EE0200"
    fun_mod = "00"
    key_len = "05"
    key_spc = "081002"
    key_index = "0004"
    key_type = "0500"
    len_of_key = "10"
    default_data = header + head_len + fun_code + fun_mod + key_len + key_spc + key_index + key_type + len_of_key
    print(default_data)
    parse_csv(default_data)

if __name__ == '__main__':
    main()
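One caveat with the snippet above (an observation, not part of the original post): building the SELECT by concatenating serial_no into the SQL text is fragile and open to SQL injection. psycopg2 can bind the parameter instead; a short sketch using the same names:

# Let psycopg2 bind serial_no instead of concatenating it into the SQL string.
query = "SELECT * FROM hsm_keys WHERE serial_no = %s"
cursor = dbconn.cursor(cursor_factory=psycopg2.extras.DictCursor)
cursor.execute(query, (serial_no,))
rows = cursor.fetchall()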

Unable to read data for FCN, I don't know what to do. The result is: Found pickle file! 0 0

import numpy as np
import os
import random
from six.moves import cPickle as pickle
from tensorflow.python.platform import gfile
import glob
import TensorflowUtils as utils

DATA_URL = 'http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip'

# download and read dataset
def read_dataset(data_dir):
    pickle_filename = "MITSceneParsing.pickle"
    pickle_filepath = os.path.join(data_dir, pickle_filename)
    if not os.path.exists(pickle_filepath):
        utils.maybe_download_and_extract(data_dir, DATA_URL, is_zipfile=True)
        SceneParsing_folder = os.path.splitext(DATA_URL.split("/")[-1])[0]
        result = create_image_lists(os.path.join(data_dir, SceneParsing_folder))
        print("Pickling ...")
        with open(pickle_filepath, 'wb') as f:
            pickle.dump(result, f, pickle.HIGHEST_PROTOCOL)
    else:
        print("Found pickle file!")
    with open(pickle_filepath, 'rb') as f:
        result = pickle.load(f)
        training_records = result['training']
        validation_records = result['validation']
        del result
    return training_records, validation_records

train_records, valid_records = read_dataset('Data_zoo/MIT_SceneParsing')
print(len(train_records))
print(len(valid_records))
The result is: Found pickle file! 0 0
Why are the lengths of train_records and valid_records 0?
I don't know where it is wrong or how to correct it.
This code is right; the bug is in create_image_lists.
Note this line in create_image_lists:
filename = os.path.splitext(f.split('/')[-1])[0]
This is no problem on Linux, but on Windows the path separator is '\\', so you should change it to:
filename = os.path.splitext(f.split('\\')[-1])[0]
Then delete the file 'MITSceneParsing.pickle' and run read_dataset again.
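A platform-independent variant (a suggestion, not part of the original answer) is to let os.path handle the separator instead of hard-coding it:

import os

# os.path.basename uses the native separator on both Linux and Windows.
filename = os.path.splitext(os.path.basename(f))[0]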

Cassandra ExecutionResult on importlib._bootstrap failed

I am trying to use multiprocessing to pull data from Cassandra, but I'm facing an issue. I want to pull rows for a single key or multiple keys using the multiprocessing approach recommended for the Cassandra driver.
My cassandra_db class
from cassandra.cluster import Cluster
import cassandra
import pandas as pd
import numpy as np
from datetime import datetime
import sys
import os
from threading import Event
import itertools
from multiprocessing import Pool
from cassandra.concurrent import execute_concurrent_with_args
from cassandra.query import tuple_factory

ip_address = '127.0.0.1'

class cassandra_db(object):
    concurrency = 2  # chosen to match the default in execute_concurrent_with_args

    def __init__(self, process_count=None):
        self.pool = Pool(processes=process_count, initializer=self._setup)

    @classmethod
    def _setup(cls):
        cls.session = Cluster([ip_address]).connect(keyspace='test')
        cls.session.row_factory = pandas_factory  # pandas_factory is defined elsewhere in the asker's code
        cls.prepared = cls.session.prepare('SELECT * FROM tr_test WHERE key=?')

    def close_pool(self):
        self.pool.close()
        self.pool.join()

    def get_results(self, params):
        try:
            xrange
        except NameError:
            xrange = range
        params = list(params)
        print("-----> ", params)
        print("-----+>", self.concurrency)
        self.pool.map(_multiprocess_get, (params[n:n + self.concurrency] for n in xrange(0, len(params), self.concurrency)))

    @classmethod
    def _results_from_concurrent(cls, params):
        return execute_concurrent_with_args(cls.session, cls.prepared, params)

def _multiprocess_get(params):
    return cassandra_db._results_from_concurrent(params)
My calling script:
import os
import pandas as pd
import sys

relative_path = '/home/anji'
sys.path.append(os.path.join(relative_path, 'commons', 'Database Operations'))

from cassandra.cluster import Cluster
from cassandra.auth import PlainTextAuthProvider
from cassandra_db import cassandra_db
from cassandra.policies import ConstantReconnectionPolicy

processes = 2
con_db = cassandra_db(processes)
keys = [(1,), (2,)]
df = con_db.get_results(keys)
print("Result", df.head())
Error:
multiprocessing.pool.MaybeEncodingError: Error sending result: '[[ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa93658bbe0>), ExecutionResult(success=True, result_or_exc=<cassandra.cluster.ResultSet object at 0x7fa936a2e0f0>)]]'. Reason: 'PicklingError("Can't pickle <class 'importlib._bootstrap.ExecutionResult'>: attribute lookup ExecutionResult on importlib._bootstrap failed",)'
I am trying to execute it for 2 keys but facing this issue. Can anyone help me solve it?
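No answer is given in the source; as a hedged sketch of the usual workaround: the PicklingError happens because the worker processes try to send ExecutionResult objects (wrapping ResultSet) back to the parent, and those objects cannot be pickled. Materializing each result into plain rows inside the worker before returning is one option, assuming the row factory produces picklable rows (e.g. the tuple_factory already imported in the question):

def _multiprocess_get(params):
    # Convert each ResultSet into a plain list of rows inside the worker,
    # so only picklable data crosses the process boundary.
    results = cassandra_db._results_from_concurrent(params)
    return [list(result_or_exc) if success else repr(result_or_exc)
            for (success, result_or_exc) in results]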
