For loop into a pandas dataframe - python-3.x

I have the following piece of code; it works and prints the data as it should, but I'm trying (unsuccessfully) to put the results into a dataframe so I can export them to a csv file.
I am looping through a json file and the results are correct; I just need the two columns that print out to go into a dataframe instead of being printed. I took out the code that was causing the error so it will run.
import json
import requests
import re
import pandas as pd

data = {}
df = pd.DataFrame(columns=['subtechnique', 'name'])

RE_FOR_SUB_TECHNIQUE = r"(T\d+)\.(\d+)"

r = requests.get('https://raw.githubusercontent.com/mitre/cti/master/enterprise-attack/enterprise-attack.json', verify=False)
data = r.json()
objects = data['objects']

for obj in objects:
    ext_ref = obj.get('external_references', [])
    revoked = obj.get('revoked') or '*****'
    subtechnique = obj.get('x_mitre_is_subtechnique')
    name = obj.get('name')
    for ref in ext_ref:
        ext_id = ref.get('external_id') or ''
        if ext_id:
            re_match = re.match(RE_FOR_SUB_TECHNIQUE, ext_id)
            if re_match:
                technique = re_match.group(1)
                sub_technique = re_match.group(2)
                print('{},{}'.format(technique + '.' + sub_technique, name))
Alternatively, is there an easier way to take the results of each row in the loop and append them to a csv file?
Any help is appreciated. Thanks.

In this instance, it's likely easier to just write the csv file directly, rather than go through Pandas:
with open("enterprise_attack.csv", "w") as f:
my_writer = csv.writer(f)
for obj in objects:
ext_ref = obj.get('external_references',[])
revoked = obj.get('revoked') or '*****'
subtechnique = obj.get('x_mitre_is_subtechnique')
name = obj.get('name')
for ref in ext_ref:
ext_id = ref.get('external_id') or ''
if ext_id:
re_match = re.match(RE_FOR_SUB_TECHNIQUE, ext_id)
if re_match:
technique = re_match.group(1)
sub_technique = re_match.group(2)
print('{},{}'.format(technique+'.'+sub_technique, name))
my_writer.writerow([technique+"."+sub_technique, name])
It should be noted that the above will overwrite the output of any previous runs. If you wish to keep the output of multiple runs, change the file mode to "a":
with open("enterprise_attack.csv", "a") as f:

Related

Python 3: How to combine values from a csv and append them to a new csv depending on a key value

I have a .csv table that looks like this:
[image: original csv]
I want to get a new .csv file that looks like this:
[image: new csv]
I have already got to the point where I have the second csv with the unique values of the SITENAMES in the first column, but now I'm struggling to append the SPECIESNAMES into the second column.
import csv
import os
import pandas as pd
from qgis.core import QgsVectorLayer  # available inside QGIS's Python environment

uri = 'file:///C:/Users/t/Desktop/T/Natura/Python/20220214_Natura2000_specieslist.txt'
csvLyr = QgsVectorLayer(uri, "csvLayer", "delimitedtext")

spalten = ["SITECODE"]
sitecodes = pd.read_csv(uri, usecols=spalten)
spalten2 = ["SPECIESNAME_deutsch"]
species = pd.read_csv(uri, usecols=spalten2)

#### Step 2: use unique() to get the unique values of the sitecodes and write them as a new column to a csv
sitecodes_unique = sitecodes.SITECODE.unique()
print(sitecodes_unique)
print(len(sitecodes_unique))

path = 'C:/Users/t/Desktop/T/Natura/Python/Ergebnisse'
if not os.path.isdir(path):
    os.makedirs(path)

with open('C:/Users/t/Desktop/T/Natura/Python/Ergebnisse/sitecodes_namen.csv', 'w+', newline='') as f:
    wr = csv.writer(f)
    for line in sitecodes_unique:
        sitecodes_unique_split = line.split(',')
        wr.writerow(sitecodes_unique_split)
Try this plain-Python alternative, which reads a csv file directly instead of the txt. It uses collections.defaultdict, as suggested by @JonSG:
from collections import defaultdict
import pandas as pd

sitecodes = pd.read_csv('file:///C:/Users/t/Desktop/T/Natura/Python/20220214_Natura2000_specieslist.csv', index_col=False)
sitecodes_df = pd.DataFrame(sitecodes, columns=sitecodes.columns)

sitecodes_namen = defaultdict(list)
for i in range(len(sitecodes_df)):
    if sitecodes_df['SITECODE'][i] in sitecodes_namen.keys():
        sitecodes_namen[sitecodes_df['SITECODE'][i]] += ',' + sitecodes_df['SPECIESNAME_deutsch'][i]
    else:
        sitecodes_namen[sitecodes_df['SITECODE'][i]] = sitecodes_df['SPECIESNAME_deutsch'][i]

df = pd.DataFrame(list(sitecodes_namen.items()), columns=sitecodes.columns)
df.to_csv('C:/Users/t/Desktop/T/Natura/Python/Ergebnisse/sitecodes_namen.csv', index=False)
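A shorter route, since pandas is already in play, is groupby with a join aggregation; a minimal sketch, assuming the same two columns (the filename here stands in for a local copy of the data):
import pandas as pd

sitecodes = pd.read_csv('20220214_Natura2000_specieslist.csv')
# one row per SITECODE, with all species names joined into one comma-separated string
grouped = (sitecodes.groupby('SITECODE')['SPECIESNAME_deutsch']
           .agg(','.join)
           .reset_index())
grouped.to_csv('sitecodes_namen.csv', index=False)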

Converting multiple .pdf files with multiple pages into 1 single .csv file

I am trying to convert .pdf data to a spreadsheet. Based on some research, it was recommended to transform the data into csv first in order to avoid errors.
So I wrote the code below, which gives me:
"TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid"
The error appears at the pd.concat call.
import tabula
import pandas as pd
import glob

path = r'C:\Users\REC.AC'
all_files = glob.glob(path + "/*.pdf")
print(all_files)

df = pd.concat(tabula.read_pdf(f1) for f1 in all_files)
df.to_csv("output.csv", index=False)
Since this might be a common issue, I am posting the solution I found. The root cause is that tabula.read_pdf returns a list of DataFrames (one per table found), so the generator expression fed pd.concat lists rather than DataFrames. Collecting all the tables first and concatenating once fixes it:
dfs = []
for f1 in all_files:
    # read_pdf returns a list of DataFrames, one per table found in the file
    dfs.extend(tabula.read_pdf(f1))
df = pd.concat(dfs)
df.to_csv("output.csv", index=False)
Breaking the iteration into two parts generates the dataframe that pd.concat needs, and therefore it works.
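Equivalently, the original one-liner works once each file's list of tables is flattened; a minimal sketch, assuming the same all_files as above:
# flatten each file's list of tables into one sequence of DataFrames;
# pages='all' reads every page (read_pdf defaults to the first page only)
df = pd.concat((table for f1 in all_files for table in tabula.read_pdf(f1, pages='all')),
               ignore_index=True)
df.to_csv("output.csv", index=False)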

Compare 2 CSV files (encoded = "utf8") keeping data format

I have 2 stock lists (new and old). How can I compare them to see which items have been added and which have been removed (I'm happy to write them to 2 different files, added and removed)?
So far I have tried something along the lines of comparing row by row:
import csv

new = "new.csv"
old = "old.csv"
add_file = "add.csv"
remove_file = "remove.csv"

with open(new, encoding="utf8") as new_read, open(old, encoding="utf8") as old_read:
    new_reader = csv.DictReader(new_read)
    old_reader = csv.DictReader(old_read)
    for new_row in new_reader:
        for old_row in old_reader:
            if old_row["STOCK CODE"] == new_row["STOCK CODE"]:
                print("found")
This works for 1 item; if I add an else: it just keeps printing until the item is found, so it's not an accurate way of comparing the files. I have about 5k rows.
There must be a better way to write the differences to the 2 different files while keeping the same data structure?
N.B. I have tried this link: Python : Compare two csv files and print out differences. It has 2 minor issues:
1. the data structure is not kept
2. there is no reference to the change of location
You could just read the data into memory and then compare.
I used sets for the codes in this example for faster lookup.
import csv

def get_csv_data(file_name):
    data = []
    codes = set()
    with open(file_name, encoding="utf8") as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            data.append(row)
            codes.add(row['STOCK CODE'])
    return data, codes

def write_csv(file_name, data, codes):
    with open(file_name, 'w', encoding="utf8", newline='') as csv_file:
        headers = list(data[0].keys())
        writer = csv.DictWriter(csv_file, fieldnames=headers)
        writer.writeheader()
        for row in data:
            if row['STOCK CODE'] not in codes:
                writer.writerow(row)

new_data, new_codes = get_csv_data('new.csv')
old_data, old_codes = get_csv_data('old.csv')
write_csv('add.csv', new_data, old_codes)
write_csv('remove.csv', old_data, new_codes)
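If pandas is an option, the same diff can be expressed with isin; a minimal sketch, assuming both files share the STOCK CODE column:
import pandas as pd

new_df = pd.read_csv('new.csv', encoding='utf8')
old_df = pd.read_csv('old.csv', encoding='utf8')

# rows whose STOCK CODE appears only in the new file (added) or only in the old file (removed)
added = new_df[~new_df['STOCK CODE'].isin(old_df['STOCK CODE'])]
removed = old_df[~old_df['STOCK CODE'].isin(new_df['STOCK CODE'])]

added.to_csv('add.csv', index=False)
removed.to_csv('remove.csv', index=False)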

Python try and except logic when returning data from an API

I am trying to use an API to store data into a CSV file.
I am querying the API / loading the data using the following:
def load_data(id):
    with urlopen('url' + str(id)) as response:
        source = response.read()
        data = json.loads(source)
    return data
This returns a dict like:
{'name': 'Blah',
 'address_1': 'Street',
 'address_2': 'Town',
 'website': 'www.blah.com'}
I am then trying to iterate through a list of target id numbers to retrieve the data like so:
for x in targets:
    data = load_data(x)
    try:
        name = data['name']
        address_1 = data['postalAddressLine1']
        address_2 = data['postalAddressLine2']
        website = data['website']
    except KeyError as e:
        pass

    with open('test.csv', 'w', newline='') as csvfile:
        # Declaring the writer
        data_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
        # Writing the headers
        data_writer.writerow(['name', 'address_1', 'address_2', 'website'])
        # Writing the data
        data_writer.writerow([name, address_1, address_2, website])
The problem I am having is that a data point is missing on some of the iterations, e.g. on loop 2 there is no website, which causes a KeyError and crashes the code, so I added the try/except to catch this.
But now it seems that I only return data for the ids which have all of the above data points.
What I would like to do is return all of the data possible, and ignore or fill in blank values where there is a KeyError.
So I am wondering: is my logic set up correctly, and how can I achieve the above goal?
Please let me know if this is not worded very well!
Edit
My code wasn't writing each row of data because I had the writer in the wrong part of the loop. Updated code below, with the right structure and Roland Smith's answer to handle missing values.
empty_value = 'TBC'

with open('test.csv', 'w', newline='') as csvfile:
    # Declaring the writer
    data_writer = csv.writer(csvfile, quoting=csv.QUOTE_ALL)
    # Writing the headers
    data_writer.writerow(['name', 'address_1', 'address_2', 'website'])
    for x in targets:
        data = load_data(x)
        # .get() with a default never raises KeyError, so no try/except is needed here
        name = data.get('name', empty_value)
        address_1 = data.get('postalAddressLine1', empty_value)
        address_2 = data.get('postalAddressLine2', empty_value)
        website = data.get('website', empty_value)
        # Writing the data
        data_writer.writerow([name, address_1, address_2, website])
What I would suggest is to add missing keys manually:
required = ('name', 'address_1', 'address_2', 'website')

data = load_data(x)
for key in required:
    if key not in data:
        data[key] = 'not available'
Now your data at least contains all the keys you expect.
Alternatively, you could pass a default as the second argument of the get method (note that dict.get takes the default positionally, not as a default= keyword):
ds = 'not available'
name = data.get('name', ds)
address_1 = data.get('address_1', ds)
address_2 = data.get('address_2', ds)
website = data.get('website', ds)
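Another option: csv.DictWriter can fill missing keys itself via its restval argument and drop unexpected ones via extrasaction='ignore', so the row dicts can be written directly. A minimal sketch, assuming load_data and targets as above and that the column names should match the API keys:
import csv

fields = ['name', 'postalAddressLine1', 'postalAddressLine2', 'website']

with open('test.csv', 'w', newline='') as csvfile:
    # restval fills in any missing key; extrasaction='ignore' skips extra keys
    writer = csv.DictWriter(csvfile, fieldnames=fields,
                            restval='TBC', extrasaction='ignore')
    writer.writeheader()
    for x in targets:
        writer.writerow(load_data(x))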

Python CSV not writing data to file

I am new to writing CSV files with Python and have been reading lots of different posts on the topic, but now I have run into a wall and could use a little help.
import csv

#headers from the read.csv file that I want to parse and write to the new file
headers = ['header1', 'header5', 'header6', 'header7']

#open the write.csv file to write the data to
with open("write.csv", 'wb') as csvWriter:
    writer = csv.writer(csvWriter)

#open the main data file that I want to parse data out of and write to write.csv
with open('reading.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    csvList = list(readCSV)

    #finds the position of the data I want to pull out and write to write.csv
    itemCode = csvList[0].index(headers[0])
    vendorName = csvList[0].index(headers[1])
    supplierID = csvList[0].index(headers[2])
    supplierItemCode = csvList[0].index(headers[3])

    for row in readCSV:
        writer.writerow([row[itemCode], row[vendorName], row[supplierID], row[supplierItemCode]])

csvWriter.close()
---UPDATE---
I made the changes suggested, tried commenting out the following part of the code, and changed 'wb' to 'w', and the program worked. However, I don't understand why, and how do I set this up so that I can pick out the headers I want to pull?
csvList = list(readCSV)
itemCode = csvList[0].index(headers[0])
vendorName = csvList[0].index(headers[1])
supplierID = csvList[0].index(headers[2])
supplierItemCode = csvList[0].index(headers[3])
Here is my updated code:
headers = ['header1', 'header5', 'header6', 'header7']

#open the write.csv file to write the data to
with open("write.csv", 'wb') as csvWriter, open('reading.csv') as csvfile:
    writer = csv.writer(csvWriter)
    readCSV = csv.reader(csvfile, delimiter=',')
    """csvList = list(readCSV)
    #finds the position of the data I want to pull out and write to write.csv
    itemCode = csvList[0].index(headers[0])
    vendorName = csvList[0].index(headers[1])
    supplierID = csvList[0].index(headers[2])
    supplierItemCode = csvList[0].index(headers[3])"""
    for row in readCSV:
        writer.writerow([row[0], row[27], row[28], row[29]])
It looks like you want to write a subset of columns to a new file. This problem is simpler with DictReader/DictWriter. Note the correct use of open when using Python 3.x. Your attempt was using the Python 2.x way.
import csv

# headers you want, in the order you want
headers = ['header1', 'header5', 'header6', 'header7']

with open('write.csv', 'w', newline='') as csvWriter, open('read.csv', newline='') as csvfile:
    writer = csv.DictWriter(csvWriter, fieldnames=headers, extrasaction='ignore')
    readCSV = csv.DictReader(csvfile)
    writer.writeheader()
    for row in readCSV:
        writer.writerow(row)
Test data:
header1,header2,header3,header4,header5,header6,header7
1,2,3,4,5,6,7
11,22,33,44,55,66,77
Output:
header1,header5,header6,header7
1,5,6,7
11,55,66,77
If you want to access both files in the same block, you should do something like this:
with open("write.csv", 'wb') as csvWriter,open('reading.csv') as csvfile:
writer = csv.writer(csvWriter)
readCSV = csv.reader(csvfile, delimiter=',' )
csvList = list(readCSV)
#finds where the position of the data I want to pull out and write to write.csv
itemCode = csvList[0].index(headers[0])
vendorName = csvList[0].index(headers[1])
supplierID = csvList[0].index(headers[2])
supplierItemCode = csvList[0].index(headers[3])
for row in readCSV:
writer.writerow([row[itemCode], row[vendorName], row[supplierID], row[supplierItemCode]])
csvWriter.close()
The with open() as csvWriter: construct handles closing of the supplied file once you exit the block. So once you get down to writer.writerow, the file is already closed.
You need to enclose the entire expression in the with open block.
with open("write.csv", 'wb') as csvWriter:
....
#Do all writing within this block
....
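Concretely, a minimal sketch of that structure (assuming Python 3, so 'w' with newline='' replaces 'wb'):
import csv

with open("write.csv", "w", newline="") as csvWriter:
    writer = csv.writer(csvWriter)
    with open("reading.csv", newline="") as csvfile:
        for row in csv.reader(csvfile):
            writer.writerow(row)
# both files are closed automatically here; no explicit close() is needed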
