Read File and count the words in column in python - python-3.x

I'm new to Python and I need some help reading a file and counting the words in a column.
I have 2 data file, which is category.csv and data.csv.
category.csv:
CATEGORY
Technology
Furniture
Office Supplies
and below is data.csv
CATEGORY
Technology
Furniture
Technology
Furniture
Office Supplies
First, I want to select 'Technology' in category.csv and match it against data.csv; after that, it should count how many times 'Technology' appears in data.csv.
import csv # import csv file

filePath1 = "category.csv"
filePath2 = "data.csv"
with open(filePath1) as csvfile1: # open category file
    with open(filePath2) as csvfile2: # open data file
        reader1 = csv.DictReader(csvfile1) # dictread file
        reader2 = csv.DictReader(csvfile2) # dictread file
        for row1 in reader1: # read all row in data file
            for row2 in reader2:
                # NOTE(review): this "for" does not test membership -- it
                # iterates over the characters of row2['CATEGORY'] and
                # rebinds row1['CATEGORY'] to each character in turn.
                for row1['CATEGORY'] in row2['CATEGORY']:
                    # NOTE(review): .count() searches only this one row's
                    # cell text, so each assignment overwrites the totals
                    # from every earlier row instead of accumulating them.
                    total_tech = row2['CATEGORY'].count('Technology')
                    total_furn = row2['CATEGORY'].count('Furniture')
                    total_offi = row2['CATEGORY'].count('Office Supplies')
# NOTE(review): reader2 is a one-shot iterator -- it is exhausted after the
# first row of reader1, so later categories never see any data rows.
print("=============================================================================")
print("Display category average stock level")
print("=============================================================================")
print( "Technology :", total_tech)
print("Furniture :", total_furn)
print("Office Supplies :", total_offi)
print( "=============================================================================")
But I failed to count it with the above code — can somebody help me? Thank you so much.

Here is the solution -
import csv  # stdlib CSV parsing


# Divider line reused by the report printer.
RULE = "============================================================================="


def load_categories(path):
    """Read the category list CSV and return ``{category_name: 0}``.

    The file must carry a 'CATEGORY' header column; every distinct value
    is registered with a zero count.
    """
    categories = {}
    with open(path) as csvfile:  # open category file
        for row in csv.DictReader(csvfile):
            categories[row["CATEGORY"]] = 0
    return categories


def count_categories(path, categories):
    """Tally each data row's 'CATEGORY' value into *categories* in place.

    Uses ``dict.get`` so a category present in the data file but missing
    from the category file is counted from zero instead of raising
    KeyError, as a bare ``+= 1`` would.  Returns *categories*.
    """
    with open(path) as csvfile:  # open data file
        for row in csv.DictReader(csvfile):
            name = row["CATEGORY"]
            categories[name] = categories.get(name, 0) + 1
    return categories


def print_report(categories):
    """Pretty-print the per-category counts between ruler lines."""
    print(RULE)
    print("Display category average stock level")
    print(RULE)
    for key, value in categories.items():
        print("{:<20} :{:>4}".format(key, value))
    print(RULE)


if __name__ == "__main__":
    counts = count_categories("data.csv", load_categories("category.csv"))
    print_report(counts)
The output is like this -
=============================================================================
Display category average stock level
=============================================================================
Technology : 2
Office Supplies : 1
Furniture : 2
=============================================================================

Related

How can I find out which row of data from this excel sheet is duplicated the most

I am trying to find out which row (street name) has the most crimes in an excel spreadsheet. I have found the sum for the highest amount of crimes I just can't find the actual row that generated that many occurrences.
import os
import csv

def main():
    """Scan the crime CSV for the largest value in column 4 and write it out.

    NOTE(review): the street name (row[3]) that produced that maximum is
    collected into `data`/`rowNumber` but never reported -- only the raw
    count reaches crime.txt, which is the asker's stated problem.
    """
    #create and save the path to the file...
    fileToRead = "C:/Users/zacoli4407/Documents/Intro_To_Scipting/Crime_Data_Set.csv"

    highestNumberOfCrimes = 0
    data = []
    rowNumber = 0
    count = 0
    with open(fileToRead, 'r') as dataToRead:
        # NOTE(review): this second open() is redundant -- the `with` above
        # already opened the file, and rebinding dataToRead means the handle
        # the `with` block closes is no longer the one being read.
        dataToRead = open(fileToRead, 'r') # open the access to the file
        reader = csv.reader(dataToRead) # gives the ability to read from the file
        for row in reader:
            if row[4].isnumeric(): # skips the header row, whose cell is not numeric
                if int(row[4]) > highestNumberOfCrimes:
                    highestNumberOfCrimes = int(row[4])
                    rowNumber = count # index of the row holding the current maximum
                data.append([row[2],row[3],row[4],row[5]]) #row 3 has the street name I am trying to acquire
                count += 1
    print(highestNumberOfCrimes)
    with open("crime.txt", "w") as outputFile:
        outputFile.write("The highest number of crimes is: \n")
        outputFile.write(str(highestNumberOfCrimes))

main()
You could do the following:
import csv
from collections import defaultdict


def busiest_street(path):
    """Return ``(street, crime_total)`` for the street with the most crimes.

    The CSV at *path* is expected to carry the street name in column 3 and
    a numeric crime count in column 4; the first row is discarded as a
    header.  Totals are accumulated as floats, matching the original
    snippet.  Raises ValueError if the file holds no data rows.
    """
    result = defaultdict(float)
    with open(path, 'r') as dataToRead:
        reader = csv.reader(dataToRead)
        next(reader)  # discard the header row
        for row in reader:
            result[row[3]] += float(row[4])
    # Street whose accumulated total is largest.
    mx = max(result, key=result.get)
    return mx, result[mx]


if __name__ == "__main__":
    # `fileToRead` is the path variable from the question's surrounding code.
    street, total = busiest_street(fileToRead)
    print(street)   # the street with maximum number of crimes
    print(total)    # the maximum number of crimes

How to read with Pandas txt file with column names in each row

I'm a beginner with Python, and I need to read a txt file where the column name appears on each row, the columns are unordered, and not all columns are present. Is there any way to read this kind of file with Pandas?
This is a example (3 rows):
pepe01#mail.com:{ssha}fiy9XI6d:created="1575487257" fwd="" spf_block="" quota="1024mb" full_name="Full Name" mailaccess="envia" mailstatus="cancelled"
pepe02#mail.com:{ssha}Q0H90Rf9:created="1305323967" mailaccess="1" mailstatus="active" admin_access="" quota="" expire="0" full_name="Full Name" pais="CO"
pepe03#mail.com:{ssha}sCPC3HOE:created="1550680636" fwd="" pass_question="" pass_answer="" disabled="Y" mailstatus="cancelled" full_name="Name"
You can use re module to parse the file.
For example:
import re
import pandas as pd

# mail:password prefix -- everything up to the second ':' on the line.
# Compiled once at module level instead of re-scanning the pattern per line.
_PREFIX_RE = re.compile(r'^(.*?):(.*?):')
# key="value" attribute pairs; empty values (e.g. fwd="") deliberately do not
# match, exactly like the original, and are later filled with '' by fillna.
_ATTR_RE = re.compile(r'([^\s]+)="([^"]+)"')


def parse_line(line):
    """Parse one account line into a dict, or return None when it doesn't match.

    Expected layout: ``mail:{hash}:key="value" key="value" ...``.
    The returned dict holds the attribute pairs plus 'mail' and 'password'.
    """
    m = _PREFIX_RE.search(line)
    if not m:
        return None
    data = dict(_ATTR_RE.findall(line.split(':', maxsplit=2)[-1]))
    data['mail'] = m.group(1)
    data['password'] = m.group(2)
    return data


def load_accounts(path):
    """Read the account file at *path* into a DataFrame (missing attrs -> '')."""
    all_data = []
    with open(path, 'r') as f_in:
        for line in f_in:
            data = parse_line(line)
            if data is not None:
                all_data.append(data)
    return pd.DataFrame(all_data).fillna('')


if __name__ == "__main__":
    df = load_accounts('<YOUR FILE>')
    print(df)
Prints the dataframe:
created quota full_name mailaccess mailstatus mail password expire pais disabled
0 1575487257 1024mb Full Name envia cancelled pepe01#mail.com {ssha}fiy9XI6d
1 1305323967 Full Name 1 active pepe02#mail.com {ssha}Q0H90Rf9 0 CO
2 1550680636 Name cancelled pepe03#mail.com {ssha}sCPC3HOE Y

Python: How to obtain desired list?

I'm trying to learn Spark so I'm totally new to it.
I have a file with thousands of lines where each one is structured like:
LFPG;EDDW;00;E170;370;LFPG;EDDW;189930555;150907;1826;!!!!;AFR1724;AFR;0;AFR1724-LFPG-EDDW-20150907180000;N;0;;;245382;;;150907;1800;0;;X;;;;;;;;;;370;;0;20150907175700;AA45458743;;;;;NEXE;NEXE;;;;;20150907180000;;;;245382;;;;;;;;;;;;;;;;;;;;;;;;;;;;AFR;;;;;;;;;;;0
The above line represents flight information from an airplane, it took off from LFPG (1st element) and landed in EDDW (2nd element), the rest of the information is not relevant for the purpose.
I'd like to print or save in a file the top ten busiest airports based on the total number of aircraft movements, that is, airplanes that took off or landed in an airport.
So in a sense, the desired output would be:
AIRPORT_NAME #TOTAL_MOVEMENTS #TAKE-OFFs #LANDINGS
I have already implemented this program in Python and would like to transform it into the Map/Reduce paradigm using Spark.
# Libraries
import sys
from collections import Counter
import collections
from itertools import chain
from collections import defaultdict

# START
# Defining default program argument
if len(sys.argv)==1:
    fileName = "airports.exp2"
else:
    fileName = sys.argv[1]

takeOffAirport = []
landingAirport = []

# Reading file
lines = 0 # Counter for file lines (NOTE(review): never incremented or used)
try:
    with open(fileName) as file:
        for line in file:
            words = line.split(';')
            # Relevant data, item1 and item2 from each file line
            origin = words[0]
            destination = words[1]
            # Populating lists
            landingAirport.append(destination)
            takeOffAirport.append(origin)
except IOError:
    print ("\n\033[0;31mIoError: could not open the file:\033[00m %s" %fileName)

airports_dict = defaultdict(list)
# Merge lists into a dictionary key:value
# Take-off counts are appended first and landing counts second, so an
# airport present in both lists ends up with value == [takeoffs, landings].
for key, value in chain(Counter(takeOffAirport).items(),
                        Counter(landingAirport).items()):
    # 'AIRPOT_NAME':[num_takeOffs, num_landings]
    airports_dict[key].append(value)

# Sum key values and add it as another value
# (reassigning existing keys while iterating is safe here: no key is
# added or removed, only values are replaced)
for key, value in airports_dict.items():
    #'AIRPOT_NAME':[num_totalMovements, num_takeOffs, num_landings]
    airports_dict[key] = [sum(value),value]

# Sort dictionary by the top 10 total movements
airports_dict = sorted(airports_dict.items(),
                       key=lambda kv:kv[1], reverse=True)[:10]
airports_dict = collections.OrderedDict(airports_dict)

# Print results
# NOTE(review): value[1] is [takeoffs, landings], yet the #TAKEOFFS column
# prints index [1][1] (the landings) and #LANDINGS prints [1][0] (the
# takeoffs) -- the two columns look swapped.  Also, an airport seen only
# as origin or only as destination has a one-element inner list, so
# [1][1] would raise IndexError -- confirm against real data.
print("\nAIRPORT"+ "\t\t#TOTAL_MOVEMENTS"+ "\t#TAKEOFFS"+ "\t#LANDINGS")
for k in airports_dict:
    print(k,"\t\t", airports_dict[k][0],
          "\t\t\t", airports_dict[k][1][1],
          "\t\t", airports_dict[k][1][0])
A test file can be download from: https://srv-file7.gofile.io/download/YCnWxr/traffic1day.exp2
So far I've been able to get the first and second elements from the file, but I don't know quite how to implement the filter or reduce to obtain the frequency with which each airport appears in each list, and then merge both lists, adding the airport name, the sum of take-offs and landings, and the individual numbers of take-offs and landings.
from pyspark import SparkContext, SparkConf

if __name__ == "__main__":
    # Local Spark context using every available core.
    conf = SparkConf().setAppName("airports").setMaster("local[*]")
    sc = SparkContext(conf = conf)

    airports = sc.textFile("traffic1hour.exp2", minPartitions=4)
    # NOTE(review): textFile() already yields one record per line (without
    # the trailing newline), so split('\n') merely wraps each line in a
    # one-element list, which sub[0] below unwraps again -- the map is a
    # no-op that could be dropped.
    airports = airports.map(lambda line : line.split('\n'))
    # Field 0 of the ';'-separated record is the take-off airport,
    # field 1 the landing airport.
    takeOff_airports = airports.map(lambda sub: (sub[0].split(';')[0]))
    landing_airports = airports.map(lambda sub: (sub[0].split(';')[1]))
    # saveAsTextFile writes a *directory* of part files, not a single .txt.
    takeOff_airports.saveAsTextFile("takeOff_airports.txt")
    landing_airports.saveAsTextFile("landing_airport.txt")
Any hint or guide it will be much appreciated.

Python ingestion of csv files

I am trying to ingest daily csv data into Python. I have a different file for each day, as shown below. I need help appending two columns whose values come from the file name: the first column should take the value before '_' and the second column takes the date part of the file name.
board_2019-08-08.csv
sign_2019-08-08.csv
Summary_2019-08-08.csv
Code :
# NOTE(review): this is not a raw string -- '\x' starts a hex escape, so
# "C:\xyz" is a SyntaxError in Python 3; it should be r"C:\xyz\..." or use
# forward slashes.
path = "C:\xyz\Files\ETL\Dashboard"
all_files = glob.glob(os.path.join(path, "*.csv"))
for file in all_files:
    # File name without directory or extension, e.g. "board_2019-08-08";
    # computed but not yet used below -- presumably meant for the new columns.
    file_name = os.path.splitext(os.path.basename(file))[0]
    dfn = pd.read_csv(file, skiprows = 17)
    dfn['Page'] = 'Dashboard'
    del dfn['Dimension']
    dfn = dfn.iloc[1:]  # drop the first remaining row
    dfn.columns = ['LoanId', 'Impressions', 'Page']
`
Try this
import os
import pandas as pd


def split_filename(filename):
    """Split ``'prefix_YYYY-MM-DD.csv'`` into ``('prefix', 'YYYY-MM-DD')``.

    The first part is everything before the first '_'; the second is the
    remainder with the extension stripped.
    """
    pre, post = filename.split("_", 1)
    return pre, post.split(".")[0]


def ingest(path):
    """Read every CSV under *path*, inserting the two name-derived columns.

    Returns the list of tagged DataFrames.
    """
    frames = []
    # Bug fix: the original called os.listdir('path') with the literal
    # string 'path' instead of the variable.
    files = [name for name in os.listdir(path) if ".csv" in name]
    for file in files:
        pre, post = split_filename(file)
        dfn = pd.read_csv(f"{path}/{file}", skiprows=17)
        # assume your initial values for column 0 and 1 is 1
        dfn.insert(0, "column1", value=pre)
        dfn.insert(1, "column2", value=post)
        # rest of your code
        frames.append(dfn)
    return frames


# Raw string: in a normal literal '\x' is an invalid hex escape (SyntaxError).
path = r"C:\xyz\Files\ETL\Dashboard"

if __name__ == "__main__":
    ingest(path)

I want to merge the rows for a particular values in csv file

I have a csv file which is structured like this. What I want to achieve is to merge the colors: for product code 1001 there are different colors, i.e. BLACK, CREAM, GRAPHITE, and I want one row for 1001 with all colors in one cell, separated by ";" (semicolon). I want to do this for all products.
EDIT
Requried Output:
1001-BLACK-P-OS ,BLACK;CREAM;Graphite
1002-BLACK-P-OS ,BLACK;CREAM
Given CSV
1001-BLACK-P-OS , BLACK
1001-CREAM-P-OS , CREAM
1001-GRAPH-P-OS , GRAPHITE
1002-BLACK-P-OS ,BLACK
1002-CREAM-P-OS ,CREAM
I am trying on python but not able to do it.
with open('ascolor.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        serial=row[0]
        d=''
        for r in readCSV:
            # NOTE(review): `is` tests object identity, not equality --
            # this should be `==` (two csv-parsed strings are rarely the
            # same object even when equal).
            if serial is r[0]:
                d=d+r[1]
                d=d+';'
# NOTE(review): the inner loop consumes the same reader object as the outer
# loop, so the file is exhausted after one pass, and `d` is rebuilt each
# iteration but never stored or written anywhere.
Create your data file:
# Seed the sample input file with the five product/colour rows
# (newline-separated, no trailing newline -- same bytes as before).
_rows = [
    "1001-BLACK-P-OS , BLACK",
    "1001-CREAM-P-OS , CREAM",
    "1001-GRAPH-P-OS , GRAPHITE",
    "1002-BLACK-P-OS ,BLACK",
    "1002-CREAM-P-OS ,CREAM",
]
data = "\n".join(_rows)
fn = 'ascolor.csv'
with open(fn, "w") as handle:
    handle.write(data)
with that we can start reformatting it:
import csv

fn = 'ascolor.csv'

# Group rows by product number: {num: [first_full_code, [colour, ...]]}
data = {}
with open(fn) as src:
    for record in csv.reader(src, delimiter=','):
        if not record:
            continue  # skip empty rows - they would cause index errors
        # The product number is the part of the code before the first '-'.
        num = record[0].split("-")[0]
        colour = record[1].strip()
        if num in data:
            # Second/third colour for this product: extend the inner list.
            data[num][1].append(colour)
        else:
            # First sighting: keep the full code and start the colour list.
            data[num] = [record[0].strip(), [colour]]
# after that youve got a dictionary of your data:
# print(data)
# {'1001': ['1001-BLACK-P-OS', ['BLACK', 'CREAM', 'GRAPHITE']],
#  '1002': ['1002-BLACK-P-OS', ['BLACK', 'CREAM']]}

# When writing csv with the module, always open the file with newline=""
# or you get stray blank lines inside the file; the csv writer emits all
# newlines itself. See
# https://docs.python.org/3/library/csv.html#csv.writer
with open("done.csv", "w", newline="") as dst:
    writer = csv.writer(dst, delimiter=",")
    for key in sorted(data.keys()):
        # Alternative kept from the original: prepend the full first code
        # writer.writerow([data[key][0], ';'.join(data[key][1])])
        # Preferred: just the product number, then the ';'-joined colours.
        writer.writerow([key, ';'.join(data[key][1])])

print("")
with open("done.csv", "r") as result:
    print(result.read())
Output:
1001,BLACK;CREAM;GRAPHITE
1002,BLACK;CREAM
or with the commented line:
1001-BLACK-P-OS,BLACK;CREAM;GRAPHITE
1002-BLACK-P-OS,BLACK;CREAM
HTH

Resources