How to make my pandas script export to csv - python-3.x

I have managed to get the output I want from this script:
But I am having trouble exporting it to a csv using:
v.to_csv(n + '.csv', index=False)
I get this error:
Traceback (most recent call last): Python/CouponRedemptions/start.py", line 22, in <module> print(v['invoice_line_normal_price']) File "~/.local/lib/python3.8/site-packages/pandas/core/frame.py", line 2902, in getitem indexer = self.columns.get_loc(key) File "~/.local/lib/python3.8/site-packages/pandas/core/indexes/base.py", line 2893, in get_loc raise KeyError(key) from err KeyError: 'invoice_line_normal_price'
I think it is the way the DF is structured, you cannot export it in its current state. I was wondering how I would go about making this work or any suggestions on where I cant start looking.
import pandas as pd
import re
r = pd.read_csv('cp.csv', low_memory=False)
r = r.filter(['shop_name','order_coupon_code','invoice_line_type','invoice_date','invoice_line_normal_price'])
r = r[r.order_coupon_code.notnull()]
r['invoice_line_normal_price'] = pd.to_numeric(r['invoice_line_normal_price'],errors = 'coerce')
n = input("Enter the coupon name: ")
nr = r[r.order_coupon_code.str.match(n,flags=re.IGNORECASE)]
nr = nr[nr.invoice_line_type.str.match('charge')]
nr = nr.sort_values('shop_name')
v = nr.groupby(['shop_name'])['invoice_line_normal_price'].value_counts().to_frame('counts')
print(v)

example of csv code
shop_name order_coupon_code invoice_line_type invoice_date invoice_line_normal_price moresome moreother hello
0 shop1 nv55 sell 01.01.2016 01:00:00.000 15.0 3 tt hi
1 shop2 nv44 quote 01.01.2016 02:00:00.000 22.0 4 rr hey
2 shop3 nv22 charge 01.01.2016 03:00:00.000 27.0 5 dd what
mport pandas as pd
# The low_memory option is not properly deprecated, but it should be, since it does not actually do anything differently
r = pd.read_csv('cp.csv')
print(r)
# r = r.loc[:,['shop_name', 'order_coupon_code', 'invoice_line_type', 'invoice_date', 'invoice_line_normal_price']]
r = r.filter(['shop_name','order_coupon_code','invoice_line_type','invoice_date','invoice_line_normal_price'])
r = r[r.order_coupon_code.notnull()]
r['invoice_line_normal_price'] = pd.to_numeric(r['invoice_line_normal_price'],errors = 'coerce')
# Enter the coupon name: nv22
n = input("Enter the coupon name: ")
nr = r[r.order_coupon_code.str.contains(n.lower())]
nr = nr[nr.invoice_line_type.str.match('charge')]
nr = nr.sort_values('shop_name')
v = nr.groupby(['shop_name'])['invoice_line_normal_price'].value_counts().to_frame('counts')
print(v)
v.to_csv(n + '.csv', index=False)
the output
shop_name invoice_line_normal_price
shop3 27.0 1
let say you need to add more to Single csv file
v.to_csv(n + '.csv',mode='a', index=False)
no header
v.to_csv(n + '.csv',mode='a', index=False,header=False)
just to make sure this error mean the name of column is not in your csv file check out the column name on your csv file
get_loc raise KeyError(key) from err KeyError: 'invoice_line_normal_price'

Related

i am trying to calculate sentimental score of each syntax of csv file by using csv library

this is the error which i am getting. In the previous post i forget to put both function . In the first function i'm reading csv file and removing punctuation and send the string to second function to calculate the sentimentel score. this code give output for few row of csv file and then show this error i'm new in python
Traceback (most recent call last):
File "C:/Users/Public/Downloads/Hotelsurvey.py", line 116, in <module>
Countswordofeachsyntax()
File "C:/Users/Public/Downloads/Hotelsurvey.py", line 92, in Countswordofeachsyntax
print(findsentimentalscore(nopunct))
File "C:/Users/Public/Downloads/Hotelsurvey.py", line 111, in findsentimentalscore
ss =ss + weight
TypeError: unsupported operand type(s) for +: 'int' and 'list'
def Countswordofeachsyntax():
nopunct = ""
with open('dataset-CalheirosMoroRita-2017.csv', 'r') as csv_file:
csv_reader = csv.reader(csv_file, delimiter='|')
for sys in csv_reader:
for value in sys:
nopunct = ""
for ch in value:
if ch not in punctuation:
nopunct = nopunct + ch
print(findsentimentalscore(nopunct))
def findsentimentalscore(st):
ss = 0
count = len(st.split())
mycollapsedstring = ' '.join(st.split())
print(str(mycollapsedstring.split(' ')) + " := " + str(len(mycollapsedstring.split())))
for key, weight in keywords.items():
if key in mycollapsedstring.lower():
ss =ss + weight
#print(key, weight)
res = (ss / count * 100)
return math.ceil(res)

How to read desired rows from large CSV files in python

I am trying to search for data from a CSV file and pass on the data to another python code.
The CSV file has 100000+ rows and from them, I want to pass on the requested data per my choice.
Actual Code:
input_file = 'trusted.csv'
users = []
with open(input_file, encoding='UTF-8') as f:
rows = csv.reader(f,delimiter=",",lineterminator="\n")
next(rows, None)
for row in rows:
user = {}
user['username'] = row[0]
user['id'] = int(row[1])
user['access_hash'] = int(row[2])
user['name'] = row[3]
users.append(user)
Parsing data to code:
g_index = input("Enter a Number: ")
target_group=groups[int(g_index)]
target_group.access_hash
The Actual code will parse All the rows from the CSV file and I am trying to find a solution for a python code that can pass on data - like from 11 to 20 rows, 50 to 100 rows likewise.
I tried the below code but received an error when the data was parsed to another python code:
import CSV
input_file = 'lucky280.csv'
start = 10
stop = start + 10
users = []
with open(input_file, encoding='UTF-8') as f:
rows = csv.reader(f,delimiter=",",lineterminator="\n")
for i, line in enumerate(rows):
if i >= start:
users.append(line)
if i > stop:
break
for row in rows:
user = {}
user['username'] = row[0]
user['id'] = int(row[1])
user['access_hash'] = int(row[2])
user['name'] = row[3]
users.append(user)
ERROR :
Traceback (most recent call last):
File "", line 10, in
print ("Adding {}".format(user['id']))
TypeError: list indices must be integers or slices, not str
If I use the Actual code the file reading will work fine but it will parse all the data in the file.
Please HELP!
After recommendation I also tried
input_file = 'lucky280.csv'
users = []
from itertools import islice
with open(input_file, encoding='UTF-8') as f:
rows = csv.reader(f,delimiter=",",lineterminator="\n")
rowiter = islice(rows, 3, 5)
for item in rowiter:
for row in rows:
user = {}
user['username'] = row[0]
user['id'] = int(row[1])
user['access_hash'] = int(row[2])
user['name'] = row[3]
users.append(user)
got the below error
IndexError Traceback (most recent call last)
<ipython-input-108-9f4099c2e53d> in <module>()
10 user = {}
11 user['username'] = row[0]
---> 12 user['id'] = int(row[1])
13 user['access_hash'] = int(row[2])
14 user['name'] = row[3]
IndexError: list index out of range
You can use islice from itertools
So here i have as sample csv file
X Y
0 21 test3
1 8 test1
2 75 test1
3 26 test2
4 98 test3
5 63 test3
6 65 test3
7 39 test3
8 74 test1
9 26 test2
And suppose I want only rows 3 and 4
>>> from itertools import islice
>>> with open('test.csv') as f:
... rows = csv.reader(f)
... rowiter = islice(rows, 3, 5)
... for item in rowiter:
... print(item)
gives me the following output
['2', '75', 'test1']
['3', '26', 'test2']
Update
input_file = 'trusted.csv'
start = 10
stop = start + 10
users = []
with open(input_file, encoding='UTF-8') as f:
rows = csv.reader(f,delimiter=",",lineterminator="\n")
rowiter = islice(rows, start, stop)
for row in rowiter :
user = {}
user['username'] = row[0]
user['id'] = int(row[1])
user['access_hash'] = int(row[2])
user['name'] = row[3]
users.append(user)

Python: How to import arules package from R using Rpy2

I am trying to use python with some nice functions in R. In particular I want to use read.transactions function which is found in one of the packages in R (arules)
I did the following steps
1- Open Anaconda and lunch R studio
In R studio
2- install.packages('arules', dep = TRUE)
3- loadNamespace('arules')
4- .libPaths()
Got
[1] "D:/Anaconda3/Lib/site-packages/rpy2/R/win-library/3.4"
[2] "C:/Program Files/R/R-3.4.4/library"
Now I go to jupyter notebook
In Jupyter Notebook
import rpy2
import rpy2.robjects as RObjects
from rpy2.robjects.packages import importr
utils = importr("utils")
d = {'print.me': 'print_dot_me', 'print_me': 'print_uscore_me'}
try:
arules = importr('arules', robject_translations = d, lib_loc = "D:/Anaconda3/Lib/site-packages/rpy2/R/win-library/3.4")
except:
arules = importr('arules', robject_translations = d, lib_loc = "C:/Program Files/R/R-3.4.4/library")
The Outcome was
---------------------------------------------------------------------------
RRuntimeError Traceback (most recent call last)
<ipython-input-3-5df30d28440c> in <module>()
3 try:
----> 4 arules = importr('arules', robject_translations = d, lib_loc = "D:/Anaconda3/Lib/site-packages/rpy2/R/win-library/3.4")
5 except:
~\Anaconda3\lib\site-packages\rpy2\robjects\packages.py in importr(name, lib_loc, robject_translations, signature_translation, suppress_messages, on_conflict, symbol_r2python, symbol_check_after, data)
452 _system_file(package = rname)):
--> 453 env = _get_namespace(rname)
454 version = _get_namespace_version(rname)[0]
RRuntimeError: Error in loadNamespace(name) : there is no package called 'arules'
During handling of the above exception, another exception occurred:
RRuntimeError Traceback (most recent call last)
<ipython-input-3-5df30d28440c> in <module>()
4 arules = importr('arules', robject_translations = d, lib_loc = "D:/Anaconda3/Lib/site-packages/rpy2/R/win-library/3.4")
5 except:
----> 6 arules = importr('arules', robject_translations = d, lib_loc = "C:/Program Files/R/R-3.4.4/library")
7
~\Anaconda3\lib\site-packages\rpy2\robjects\packages.py in importr(name, lib_loc, robject_translations, signature_translation, suppress_messages, on_conflict, symbol_r2python, symbol_check_after, data)
451 if _package_has_namespace(rname,
452 _system_file(package = rname)):
--> 453 env = _get_namespace(rname)
454 version = _get_namespace_version(rname)[0]
455 exported_names = set(_get_namespace_exports(rname))
RRuntimeError: Error in loadNamespace(name) : there is no package called 'arules'
Which was not able to import the R package to Python
I did the same with DirichletReg and it was successful. I do not know why.
Can anyone help me with this?
importr looks into R_HOME directory for the installed R packages. I assume, arules package was not added in the library folder of R_HOME instead it is added in some other location let's say 'C:\Users\User_name\Documents\R\win-library\3.x.x' which might be causing the issue.
If that is the case, copy arules folder from that specific location and add into library folder of R_HOME directory. Try this approach and see whether you are able to come out of the problem.
Now to the last of the discovery, there is nothing like that in python, however, there is a way out to use read.transactions
groceries <- read.transactions("groceries.csv", sep = ",")
> summary(groceries)
transactions as itemMatrix in sparse format with
9835 rows (elements/itemsets/transactions) and
169 columns (items) and a density of 0.02609146
Python Jupyter notebook
1) Import the data as
import requests
url = 'https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/groceries.csv'
grocery_dataset = requests.get(url)
# Save string as txt file
f = open('grocery_dataset.txt','w')
f.write(grocery_dataset.text)
f.close()
2) Separate the data and adjust them as you wish
import csv
grocery_items = set()
with open("grocery_dataset.txt") as f:
reader = csv.reader(f, delimiter=",")
for i, line in enumerate(reader):
grocery_items.update(line)
output_list = list()
with open("grocery_dataset.txt") as f:
reader = csv.reader(f, delimiter=",")
for i, line in enumerate(reader):
row_val = {item:0 for item in grocery_items}
row_val.update({item:1 for item in line})
output_list.append(row_val)
4) save it as a Dataframe in python
import pandas as pd
grocery_df = pd.DataFrame(output_list)
hence
grocery_df.shape
will give
(9835, 169)
which represent that of the rows and columns of the summary(groceries) in R

How to fix the code about appending a number in the line?

I create a new column (name:Account) in the csv, then try to make a sequence (c = float(a) + float(b)) and for each number in sequence append to the original line in the csv, which is the value of the new column. Here is my code:
# -*- coding: utf-8 -*-
import csv
with open('./tradedate/2007date.csv') as inf:
reader = csv.reader(inf)
all = []
row = next(reader)
row.append('Amount')
all.append(row)
a =50
for i, line in enumerate(inf):
if i != 0:
size = sum(1 for _ in inf) # count the line number
for b in range(1, size+1):
c = float(a) + float(b) # create the sequence: in 1st line add 1, 2nd line add 2, 3rd line add 3...etc
line.append(c) # this is the error message: AttributeError: 'str' object has no attribute 'append'
all.append(line)
with open('main_test.csv', 'w', newline = '') as new_csv:
csv_writer = csv.writer(new_csv)
csv_writer.writerows(all)
The csv is like this:
日期,成交股數,成交金額,成交筆數,發行量加權股價指數,漲跌點數,Account
96/01/02,"5,738,692,838","141,743,085,172","1,093,711","7,920.80",97.08,51
96/01/03,"5,974,259,385","160,945,755,016","1,160,347","7,917.30",-3.50,52
96/01/04,"5,747,756,529","158,857,947,106","1,131,747","7,934.51",17.21,53
96/01/05,"5,202,769,867","143,781,214,318","1,046,480","7,835.57",-98.94,54
96/01/08,"4,314,344,739","115,425,522,734","888,324","7,736.71",-98.86,55
96/01/09,"4,533,381,664","120,582,511,893","905,970","7,790.01",53.30,56
The Error message is:
Traceback (most recent call last):
File "main.py", line 21, in <module>
line.append(c)
AttributeError: 'str' object has no attribute 'append'
Very thanks for any help!!
I'm a little confused why you're structuring your code this way, but the simplest fix would be to change the append (since you can't append to a string) to += a string version of c, i.e.
line += str(c)
or
line += ',{}'.format(c)
(I'm not clear based on how you're written this if you need the comma or not)
The biggest problem is that you're not using your csv reader - below is a better implementation. With the csv reader it's cleaner to do the append that you want to do versus using the file object directly.
import csv
with open('./tradedate/2007date.csv') as old_csv:
with open('main_test.csv', 'w') as new_csv:
writer = csv.writer(new_csv, lineterminator='\n')
reader = csv.reader(old_csv)
all = []
row = next(reader)
row.append('Line Number')
all.append(row)
line_number = 51
for row in reader:
row.append(line_number)
all.append(row)
line_number += 1
writer.writerows(all)

how to convert multiple csv files to multiple tables in sqlite using python3?

I was trying to import multiple csv files into sqlite database into multiple tables(using jupyter notebook in python3). The name of each file will be the name of the table. I have defined a function to covert the encoding to utf8 as below:
import sqlite3
import glob
import csv
import sys
def convert_to_utf8(dirname):
for filename in glob.glob(os.path.join(dirname, '*.csv')):
ifp = open(filename, "rt", encoding='cp1252')
input_data = ifp.read()
ifp.close()
ofp = open(filename + ".fix", "wt", encoding='utf-8')
for c in input_data:
if c != '\0':
ofp.write(c)
ofp.close()
return
all the files are in the same folder. staging_dir_name_1 is where the files are. And I have below code to covert the csv file into tables, some of the codes are from similar questions in StackFlow:
convert_to_utf8(staging_dir_name_1)
conn = sqlite3.connect("medicare_hospital_compare_1.db")
c = conn.cursor()
for filename in glob.glob(os.path.join(staging_dir_name_1, '*.csv')):
with open(filename, "rb") as f:
data = csv.DictReader(f)
cols = data.fieldnames
tablename = os.path.splitext(os.path.basename(filename))[0]
sql_str = "drop table if exists %s" % tablename
c.execute(sql_str)
sql_str = "create table if not exists %s (%s)" % (tablename, ','.join(["%s text" % col for col in cols]))
c.execute(sql_str)
sql_str = "insert into %s values (%s)" % (tablename, ','.join(["?" for col in cols]))
c.executemany(sql_str, (list(map(row.get, cols)) for row in data))
conn.commit()
but when i run this i get this error
> Error Traceback (most recent call
> last) <ipython-input-29-be7c1f43e4c5> in <module>()
> 2 with open(filename, "rb") as f:
> 3 data = csv.DictReader(f)
> ----> 4 cols = data.fieldnames
> 5 tablename = os.path.splitext(os.path.basename(filename))[0]
> 6
>
> C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
> 96 if self._fieldnames is None:
> 97 try:
> ---> 98 self._fieldnames = next(self.reader)
> 99 except StopIteration:
> 100 pass
>
> Error: iterator should return strings, not bytes (did you open the
> file in text mode?)
Could anyone help me on how to resolve this issue? I have been thinking about it for a while but still couldn't figure out how to resolve this.
**===UPDATE===**
Now i have changed 'rb' to 'rt', i got a new error full NULL values, i think the first function has already removed all the null values
Error Traceback (most recent call last)
<ipython-input-77-68d56c0b4cf2> in <module>()
3
4 data = csv.DictReader(f)
----> 5 cols = data.fieldnames
6 table = os.path.splitext(os.path.basename(filename))[0]
7
C:\Users\dupin\Anaconda3\lib\csv.py in fieldnames(self)
96 if self._fieldnames is None:
97 try:
---> 98 self._fieldnames = next(self.reader)
99 except StopIteration:
100 pass
Error: line contains NULL byte

Resources