I'm trying to load some data into a HANA 2.0 database from a Python 3.x Jupyter notebook and getting an error.
Please advise.
The type of the column is INT (Pregnancies); the structure is as below,
and in the CSV file the data is a number,
so the data should go into the Pregnancies column.
The error I'm getting while inserting the data is:
hdbcli.dbapi.Error: (-10427, "Conversion of parameter/column (1) from
data type UCS2 (LE) to INT failed (invalid number: not a valid number
string 'Pregnancies')")
It's a big script; the issue is in the insert method:
@staticmethod
def insert_data(connection, tablename, cols, inlist, data, batch_size):
    sql = 'insert into ' + tablename + inlist
    if len(data) > 0:
        with connection.connection.cursor() as cur:
            rows_inserted = cur.executemany(sql, data)
@staticmethod
def file_load(connection, table_descriptions, cols, inlist, filename, file_count,
              train_percentage, valid_percentage, test_percentage, batch_size):
    with open(filename, 'r') as my_file:
        reader = csv.reader(my_file, delimiter=',')
        data = list()
        data_list = list()
        load_count = 0
        for row in reader:
            remain_count = file_count - load_count
            if remain_count < batch_size:
                batch_size = remain_count
            if len(data) <= batch_size:
                data.append(list(row))
            if len(data) == batch_size:
                DataSets.split_data_into_tables(connection, data, table_descriptions,
                                                train_percentage, valid_percentage,
                                                test_percentage, cols, inlist,
                                                batch_size, file_count)
                load_count += len(data)
                data = list()
                print("Data Loaded:{}%".format(math.floor(load_count / file_count * 100)))
The whole code for this is at the link below:
https://github.com/SAP-samples/hana-ml-samples/blob/master/Python-API/pal/notebooks/data_load_utils.py
Based on the error message text, I’m rather sure that the problem is that the import code tries to read the first line of the CSV file (the line that contains the column names/headers) and importing these names to the target table is what fails.
To avoid this, just skip the first line of the CSV file.
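For example, in file_load above, the header row can be consumed right after creating the reader (a minimal sketch; the batching logic stays unchanged):
with open(filename, 'r') as my_file:
    reader = csv.reader(my_file, delimiter=',')
    next(reader, None)  # consume the header row so the column names are never passed to executemany
    for row in reader:
        ...  # existing batching logic unchanged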
The error says that the data in the column is of type UCS2, a character encoding standard in which characters are represented by fixed-length 16-bit (2-byte) units, while you have defined the column as type INT.
So make sure the values in that CSV column are valid integers first, and then load the data.
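Alternatively, the values can be cast client-side before the insert; a minimal sketch (assuming the header row is already skipped, and assuming Pregnancies is the first column, since the error reports parameter/column 1):
# cast the Pregnancies column to int before executemany
# (position 0 is an assumption based on the error's parameter index)
typed_data = [[int(row[0])] + row[1:] for row in data]
cur.executemany(sql, typed_data)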
I wanted to create a program that converts CSV files to DXF (AutoCAD), but the CSV file sometimes comes with a header and sometimes without one, and some cells, such as coordinates, cannot be empty. I also noticed that after excluding some of the inputs the value is nan or NaN, and those had to be removed. So I offer you my answer; please share your opinions on how to implement a better method.
Sample input
Output
Solution
import string
import pandas

def pandas_clean_csv(csv_file):
    """
    Function pandas_clean_csv Documentation
    - I got help from this site; it may help you as well:
      "Get the row with the largest number of missing data" for more documentation:
      https://moonbooks.org/Articles/How-to-filter-missing-data-NAN-or-NULL-values-in-a-pandas-DataFrame-/
    """
    try:
        if not csv_file.endswith('.csv'):
            raise TypeError("Be sure you select a .csv file")
        # get punctuation marks as a list: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
        punctuations_list = [mark for mark in string.punctuation]
        # import the CSV file and read it with pandas
        data_frame = pandas.read_csv(
            filepath_or_buffer=csv_file,
            header=None,
            skip_blank_lines=True,
            error_bad_lines=True,
            encoding='utf8',
            na_values=punctuations_list
        )
        # if the elevation column is NaN, convert it to 0
        data_frame[3] = data_frame[3].fillna(0)
        # if the Description column is NaN, convert it to '-'
        data_frame[4] = data_frame[4].fillna('-')
        # select the coordinate columns
        coord_columns = data_frame.iloc[:, [1, 2]]
        # convert the coordinate columns to numeric type
        coord_columns = coord_columns.apply(pandas.to_numeric, errors='coerce', axis=1)
        # find rows with missing data
        index_with_nan = coord_columns.index[coord_columns.isnull().any(axis=1)]
        # remove rows with missing data
        data_frame.drop(index_with_nan, axis=0, inplace=True)
        # iterate over the data frame as tuples
        output_clean_csv = data_frame.itertuples(index=False)
        return output_clean_csv
    except Exception as E:
        print(f"Error: {E}")
        exit(1)

out_data = pandas_clean_csv('csv_files/version2_bad_headers.csv')
for i in out_data:
    print(i[0], i[1], i[2], i[3], i[4])
Here you can download my test CSV files.
I have an issue converting a chunked list into multiple dictionaries in order to send my requests in batches:
fd = open(filename, 'r')
sqlFile = fd.read()
fd.close()

commands = sqlFile.split(';')
for command in commands:
    try:
        c = conn.cursor()
        c.execute(command)
        # create a list with the query results in batches of size 100
        for batch in grouper(c.fetchall(), 100):
            # this is where the error occurs:
            result = [dict(zip([key[0] for key in c.description], i)) for i in batch]
            # TODO: Send the JSON with 100 items to the API
    except RuntimeError:
        print('Error.')
The issue is that it only iterates through the batches once and gives the following error. There are actually 167 rows, so the first request should send 100 items and the second iteration should send the remaining 67 items.
TypeError: zip argument #2 must support iteration
I solved the issue by making a dictionary right away with c.rowfactory = makeDictFactory(c):
def makeDictFactory(cursor):
    columnNames = [d[0] for d in cursor.description]
    def createRow(*args):
        return dict(zip(columnNames, args))
    return createRow

def getAndConvertDataFromDatabase(filename):
    fd = open(filename, 'r')
    sqlFile = fd.read()
    fd.close()

    commands = sqlFile.split(';')
    for command in commands:
        try:
            c = conn.cursor()
            c.execute(command)
            c.rowfactory = makeDictFactory(c)
            data = c.fetchall()
            for batch in [data[x:x + 100] for x in range(0, len(data), 100)]:
                postBody(json.dumps(batch, default=myconverter), dataList[filename])
        except RuntimeError:
            print('Error.')
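For reference, the TypeError in the original code most likely came from grouper padding the last group: assuming grouper was the usual itertools recipe, the final batch of 67 rows is padded to 100 with None, so zip receives None instead of a row tuple. Slicing, as above, avoids the padding entirely; filtering out the fill value also works (a sketch, reusing the cursor c from the snippet above):
from itertools import zip_longest

def grouper(iterable, n, fillvalue=None):
    # standard itertools recipe: pads the last group with fillvalue
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)

for batch in grouper(c.fetchall(), 100):
    rows = [r for r in batch if r is not None]  # drop the padding
    result = [dict(zip([key[0] for key in c.description], row)) for row in rows]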
I have to read this CSV file into a list, and I want certain columns to be written into the list as integers, but this error has become a problem for me.
with open('new_toy_dataset.csv', 'r') as cf:
    for row in cf:
        toy_list.append([int(row[0]), row[1], row[2], int(row[3]), int(row[4]), row[5]])
Data Set
Error
The problem is that you are not skipping the header.
import csv

toy_list = []
ind = 0
with open('new_toy_dataset.csv', 'r') as cf:
    reader = csv.reader(cf, delimiter=';')  # whatever delimiter it is
    for row in reader:
        if ind == 0:
            ind += 1
            continue
        toy_list.append([int(row[0]), row[1], row[2], int(row[3]), int(row[4]), row[5]])
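A slightly shorter variant (same behavior, assuming the file always starts with a header row) consumes the header with next() instead of a counter:
import csv

toy_list = []
with open('new_toy_dataset.csv', 'r') as cf:
    reader = csv.reader(cf, delimiter=';')
    next(reader, None)  # skip the header row
    for row in reader:
        toy_list.append([int(row[0]), row[1], row[2], int(row[3]), int(row[4]), row[5]])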
I have a huge dataset that cannot be stored in memory, so I pre-batched it into several files. How do I write my Dataset and DataLoader classes so that they load one batch at a time?
All the files have the same base name and a unique batch number; an example file would be called o3_batch_1.hdf5 or o3_batch_2.hdf5, and the largest batch number is o3_batch_102.hdf5.
Here is what I have tried so far. Would it work?
length would be the total length of the data; batchNum would be the non-unique number at the end of the file; base is the common name shared by the files.
class Data(Dataset):
    # Constructor
    def __init__(self, base, batchNum, length):
        name = base + str(batchNum)
        with h5py.File(name, "r") as f:
            puzz = np.array(f.get('puzzle'))
            sol = np.array(f.get('Sol'))
            self.puzz = torch.from_numpy(puzz)
            self.sol = torch.from_numpy(sol)
        self.len = length

    # Getter
    def __getitem__(self, batchNum, index):
        return self.puzz[index], self.sol[index]

    # Get length
    def __len__(self):
        return self.len
I think you can iterate over the Index array, and you can get your data through iteration.
Suppose your file is organized in the following manner
/yourFileDir
o3_batch_1.hdf5
o3_batch_2.hdf5
...
o3_batch_102.hdf5
And your batch index is 1, 2, ..., 102:
h5_dir = '/yourFileDir/'
for Index in range(1, 103):
    with h5py.File(h5_dir + 'o3_batch_{}.hdf5'.format(Index), 'r') as f:
        puzz = np.array(f['puzzle'])
        sol = np.array(f['Sol'])  # this depends on how you save your data
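If you also want the Dataset itself to load one file at a time rather than everything up front, one option is to map a flat index to a file number and an offset, and open the file lazily in __getitem__. A sketch, not tested against your data: LazyH5Data and samples_per_file are hypothetical names, and it assumes every batch file holds the same number of samples.
import h5py
import numpy as np
import torch
from torch.utils.data import Dataset

class LazyH5Data(Dataset):
    def __init__(self, base, num_files, samples_per_file):
        self.base = base                        # e.g. '/yourFileDir/o3_batch_'
        self.num_files = num_files              # 102 in your case
        self.samples_per_file = samples_per_file

    def __len__(self):
        return self.num_files * self.samples_per_file

    def __getitem__(self, index):
        # map the flat index to (file number, offset inside that file)
        file_idx, offset = divmod(index, self.samples_per_file)
        name = '{}{}.hdf5'.format(self.base, file_idx + 1)  # files are numbered from 1
        with h5py.File(name, 'r') as f:
            puzz = torch.from_numpy(np.asarray(f['puzzle'][offset]))
            sol = torch.from_numpy(np.asarray(f['Sol'][offset]))
        return puzz, sol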
Edit 12/07/19: The problem was not in fact with the pandas rename function but with the fact that I did not return the pandas DataFrame from the function, and as a result the column change did not exist when printing. i.e.
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
    return as_pandas  # <- this line was missing
Please see the user comment below and upvote them for finding this error for me.
Alternatively, you can continue reading.
The data can be downloaded from this link, but I have added a sample dataset. The formatting of the file is not that of a typical CSV file; I believe this may have been an assessment piece and is related to the Hidden Decision Tree article. I have given the portion of the code that solves the issues surrounding the format of the text file, as mentioned above, and allows the user to rename the column.
The problem occurred when I tried to create a renaming function:
def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)
However, it seemed to work when I set the variable names inside the rename function:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas
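A minimal demonstration of the rename behavior involved (inplace=True mutates the frame and returns None, while the default returns a modified copy), which is why the missing return mattered:
import pandas as pd

df = pd.DataFrame({'Unique Pageviews': [1, 2]})
out = df.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
print(out)                   # None: inplace=True returns nothing
print(df.columns.tolist())   # ['Page_Views']: the frame itself was mutated

df2 = df.rename(columns={'Page_Views': 'Views'})  # no inplace: returns a copy
print(df.columns.tolist())   # ['Page_Views']: original unchanged
print(df2.columns.tolist())  # ['Views']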
Sample Dataset
Title URL Date Unique Pageviews
oupUrl=tutorials 18-Apr-15 5608
"An Exclusive Interview with Data Expert, John Bottega" http://www.datasciencecentral.com/forum/topics/an-exclusive-interview-with-data-expert-john-bottega?groupUrl=announcements 10-Jun-14 360
Announcing Composable Analytics http://www.datasciencecentral.com/forum/topics/announcing-composable-analytics 15-Jun-14 367
Announcing the release of Spark 1.5 http://www.datasciencecentral.com/forum/topics/announcing-the-release-of-spark-1-5 12-Sep-15 156
Are Extreme Weather Events More Frequent? The Data Science Answer http://www.datasciencecentral.com/forum/topics/are-extreme-weather-events-more-frequent-the-data-science-answer 5-Oct-15 204
Are you interested in joining the University of California for an empiricalstudy on 'Big Data'? http://www.datasciencecentral.com/forum/topics/are-you-interested-in-joining-the-university-of-california-for-an 7-Feb-13 204
Are you smart enough to work at Google? http://www.datasciencecentral.com/forum/topics/are-you-smart-enough-to-work-at-google 11-Oct-15 3625
"As a software engineer, what's the best skill set to have for the next 5-10years?" http://www.datasciencecentral.com/forum/topics/as-a-software-engineer-what-s-the-best-skill-set-to-have-for-the- 12-Feb-16 2815
A Statistician's View on Big Data and Data Science (Updated) http://www.datasciencecentral.com/forum/topics/a-statistician-s-view-on-big-data-and-data-science-updated-1 21-May-14 163
A synthetic variance designed for Hadoop and big data http://www.datasciencecentral.com/forum/topics/a-synthetic-variance-designed-for-hadoop-and-big-data?groupUrl=research 26-May-14 575
A Tough Calculus Question http://www.datasciencecentral.com/forum/topics/a-tough-calculus-question 10-Feb-16 937
Attribution Modeling: Key Analytical Strategy to Boost Marketing ROI http://www.datasciencecentral.com/forum/topics/attribution-modeling-key-concept 24-Oct-15 937
Audience expansion http://www.datasciencecentral.com/forum/topics/audience-expansion 6-May-13 223
Automatic use of insights http://www.datasciencecentral.com/forum/topics/automatic-use-of-insights 27-Aug-15 122
Average length of dissertations by higher education discipline. http://www.datasciencecentral.com/forum/topics/average-length-of-dissertations-by-higher-education-discipline 4-Jun-15 1303
This is the full code that produces the Key Error:
def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)

def change_column_names(as_pandas, old_name, new_name):
    as_pandas.rename(columns={old_name: new_name}, inplace=True)

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'},
                     inplace=True)

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element,
# i.e. Title, URL, Date, Page Views,
# and save to string_of_rows with a comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    # change_column_names(multi_sets)
    change_column_names(multi_sets, 'Old_Name', 'New_Name')
    print(multi_sets)

main()
I cleaned up your code so it would run. You were changing the column names but not returning the result. Try the following:
import pandas as pd
import numpy as np
import math
import csv

def set_new_columns(as_pandas):
    titles_list = ['Year > 2014', 'Forum', 'Blog', 'Python', 'R',
                   'Machine_Learning', 'Data_Science', 'Data',
                   'Analytics']
    for number, word in enumerate(titles_list):
        as_pandas.insert(len(as_pandas.columns), titles_list[number], 0)

def title_length(as_pandas):
    # Insert a new column header, then count the number of letters in 'Title'
    as_pandas.insert(len(as_pandas.columns), 'Title_Length', 0)
    as_pandas['Title_Length'] = as_pandas['Title'].map(str).apply(len)

# Although it is a log, the difference logX1 - logX2 is an inverse linear
# comparison, so you can think of it as the percentage change in Page Views.
# map allows the function to be applied to every row in column 'Page_Views'.
def log_page_view(as_pandas):
    # Insert a new column header
    as_pandas.insert(len(as_pandas.columns), 'Log_Page_Views', 0)
    as_pandas['Log_Page_Views'] = as_pandas['Page_Views'].map(lambda x: math.log(1 + float(x)))

def change_to_numeric(as_pandas):
    # Check for missing values, then convert the column to numeric.
    # Note: operate on the column so the caller's frame is actually modified.
    as_pandas['Page_Views'] = as_pandas['Page_Views'].replace(r'^\s*$', np.nan, regex=True)
    as_pandas['Page_Views'] = pd.to_numeric(as_pandas['Page_Views'],
                                            errors='coerce')

def change_column_names(as_pandas):
    as_pandas.rename(columns={'Unique Pageviews': 'Page_Views'}, inplace=True)
    return as_pandas

def open_as_dataframe(file_name_in):
    reader = pd.read_csv(file_name_in, encoding='windows-1251')
    return reader

# Get each column of data including the heading and separate each element,
# i.e. Title, URL, Date, Page Views,
# and save to string_of_rows with a comma separator for storage as a csv
# file.
def get_columns_of_data(*args):
    # Function that accepts variable-length arguments
    string_of_rows = str()
    num_cols = len(args)
    try:
        if num_cols > 0:
            for number, element in enumerate(args):
                if number == (num_cols - 1):
                    string_of_rows = string_of_rows + element + '\n'
                else:
                    string_of_rows = string_of_rows + element + ','
    except UnboundLocalError:
        print('Empty file \'or\' No arguments received, cannot be zero')
    return string_of_rows

def open_file(file_name):
    try:
        with open(file_name) as csv_file_in, open('HDT_data5.txt', 'w') as csv_file_out:
            csv_read = csv.reader(csv_file_in, delimiter='\t')
            for row in csv_read:
                try:
                    row[0] = row[0].replace(',', '')
                    csv_file_out.write(get_columns_of_data(*row))
                except TypeError:
                    continue
        print("The file name '{}' was successfully opened and read".format(file_name))
    except IOError:
        print('File not found \'OR\' Not in current directory\n')

# All acronyms used in variable naming correspond to the function at time
# of return from function.
# csv_list being a list of the csv file contents, the remainder i.e. 'st' of
# csv_list_st = split_title().
def main():
    open_file('HDTdata3.txt')
    multi_sets = open_as_dataframe('HDT_data5.txt')
    multi_sets = change_column_names(multi_sets)
    change_to_numeric(multi_sets)
    log_page_view(multi_sets)
    title_length(multi_sets)
    set_new_columns(multi_sets)
    print(multi_sets)

main()