Method reads properly but the written text file only has 1 line. Is \n not working? - python-3.x

The goal is to extract specific data from a text file under a folder
then write that data into another file under different folder
The extraction part works, save to variables and can even print them
The problem arises when you try to write them to a file
The file is empty
Need to write in this format
{self.title};;;{self.author};;;{self.release_date};;;
{self.last_update_date};;;{self.language};;;{self.producer};;;{self.book_path}
# This class includes all the operations related to a book
class Operation:
    """
    Operations on the book text files stored under ``book_folder_path``.

    Need to include these class variables
    book_title_list (List of all books titles such as “[title1, title2, title3, …]”)
    book_info_dict = “{title1: obj1, title2:obj2, title3:obj3…….}”)
    """
    book_folder_path = './data/books_data/'
    book_info_path = './data/result_data/books.txt'

    def extract_book_info(self):
        """Extract the header fields of every book and write one line per book.

        Each file in ``book_folder_path`` is read, lines 10..21 (0-based) are
        taken as the header block, and one record is written to
        ``book_info_path`` in the form::

            title;;;author;;;release_date;;;last_update_date;;;language;;;producer;;;book_path

        Returns:
            True when every book was parsed and the result file was written,
            False when any file could not be read, parsed or written.

        Raises:
            OSError: if ``book_folder_path`` itself cannot be listed (the
                listing happens outside the ``try``, as before).
        """
        # Sorted so the output order is deterministic across platforms.
        directory_files = sorted(os.listdir(self.book_folder_path))
        records = []
        try:
            for file_name in directory_files:
                book_path = os.path.join(self.book_folder_path, file_name)
                with open(book_path, 'r', encoding='utf8') as f:
                    f_line_free = [line.strip() for line in f]
                # Slicing only the required elements of the list
                f_lists = f_line_free[10:22]

                # Each slice drops a fixed-width label in front of the value
                # (presumably "Title: ", "Author: ", ... - confirm against the
                # actual input files).
                title_data = f_lists[0][7:]
                author_data = f_lists[2][8:]
                release_date_data = f_lists[4][14:]
                # [24:-1] also drops the final character of the line.
                last_update_date_data = f_lists[5][24:-1]
                language_data = f_lists[7][10:]
                producer_data = f_lists[11][13:]
                print(title_data)

                # The last field is the path of the book itself, not the path
                # of the result file.
                records.append(
                    f'{title_data};;;{author_data};;;{release_date_data};;;'
                    f'{last_update_date_data};;;{language_data};;;'
                    f'{producer_data};;;{book_path}\n'
                )

            # Open the result file once. Re-opening it in 'w' mode for every
            # book truncated it each time, leaving only the last record.
            with open(self.book_info_path, 'w', encoding='utf8') as wf:
                wf.writelines(records)
            return True
        except Exception:
            # Best-effort contract: any failure is reported as False.
            return False

Related

Read data from txt file, store it, use it for analyzing, write it to the txt file

The task is to read the data from a given txt file and add the numbers in it to a list, so that every number on a row becomes an element of this list. After the file has been read, the created list is returned to main().
this list with the objects will be parameters for the def Analyze part in where at the same time
will be found min, max, average and sum.
def lueTiedosto(data):
    """Read one integer per line from ``L07T4D1.txt`` and append them to ``data``.

    Args:
        data: list that receives the parsed integers (mutated in place).

    Returns:
        The same ``data`` list. The previous version returned the last
        printed element and raised NameError when ``data`` was empty.

    Raises:
        FileNotFoundError: if ``L07T4D1.txt`` does not exist.
        ValueError: if a non-blank line is not an integer.
    """
    # The context manager closes the file even when int() raises.
    with open("L07T4D1.txt", 'r', encoding="UTF-8") as Tiedosto:
        for Rivi in Tiedosto:
            # Skip blank lines (e.g. a trailing empty line) instead of failing.
            if Rivi.strip():
                data.append(int(Rivi))
    for element in data:
        print(element)
    print("Tiedosto L07T4D1.txt luettu.")
    return data
The fixed code which works:
def lueTiedosto(data):
    """Ask for a file name, read one integer per line and append them to ``data``.

    Args:
        data: list that receives the parsed integers (mutated in place).

    Returns:
        The same ``data`` list.

    Raises:
        FileNotFoundError: if the entered file does not exist.
        ValueError: if a non-blank line is not an integer.
    """
    Lue = input("Luettavan tiedoston nimi on ''.\n")
    print(f"Anna uusi nimi, enter säilyttää nykyisen: ", end='')
    # The context manager closes the file even when int() raises; the manual
    # open()/close() pair leaked the handle on a parse error.
    with open(Lue, 'r', encoding="UTF-8") as Tiedosto:
        for Rivi in Tiedosto:
            # Skip blank lines (e.g. a trailing empty line) instead of failing.
            if Rivi.strip():
                data.append(int(Rivi))
    print(f"Tiedosto '{Lue}' luettu.")
    return data
Making an assumption that your input file is similar to the following:
10000
12345
10008
12000
I would do the following:
filepath = r".......\L07T4D1.txt" # Path to file being loaded


def readData(filepath: str) -> list[int]:
    """Return the integers found in the first column of each line of a file.

    Args:
        filepath: path of the text file to read.

    Returns:
        One integer per non-blank line, taken from the first
        whitespace-separated field of that line.

    Raises:
        ValueError: if the first field of a non-blank line is not an integer.
    """
    rslt = []
    with open(filepath, 'r') as f:
        # Iterate over every line. The previous `while data:` loop stopped at
        # the first blank line and silently dropped everything after it.
        for line in f:
            fields = line.split()
            if fields:
                rslt.append(int(fields[0]))
    return rslt
def analyze(data: list[int]) -> None:
    """Print the maximum, minimum, sum and average of ``data``, one per line."""
    largest = max(data)
    print(f'Max Value = {largest}')
    smallest = min(data)
    print(f'Min Value = {smallest}')
    total = sum(data)
    print(f'Sum Value = {total}')
    average = total / len(data)
    print(f'Avg Value = {average}')
Running analyze(readData(filepath)) Yields:
Max Value = 12345
Min Value = 10000
Sum Value = 44353
Avg Value = 11088.25

.csv to .arff function on Python

I'm trying to write a conversion function from CSV to ARFF. Right now I have this:
def csv2arff(csv_path, arff_path=None):
    """Write the relation line and the CSV header line to an ARFF file.

    Only the first line of the CSV (the attribute names) is copied; the
    per-attribute lines and the data section are not produced yet.

    Args:
        csv_path: path of the CSV file to read.
        arff_path: path of the file to write. When omitted it is derived from
            ``csv_path`` by dropping its last four characters and appending
            ``'_prueba.arff'``.
    """
    if arff_path is None:
        arff_path = csv_path[:-4] + '_prueba.arff'  # *.csv -> *_prueba.arff
    with open(csv_path, 'r') as fr, open(arff_path, 'w') as fw:
        # NOTE(review): the ARFF format spells this keyword '@relation';
        # the '#' prefix is kept because the existing output uses it - confirm.
        fw.write('#relation base_datos_modelo_3_limpia \n')
        # readline() reads just the header instead of the whole file and
        # yields '' (not IndexError) when the CSV is empty.
        firstline = fr.readline().rstrip()
        fw.write(firstline)
and that gives me:
#relation base_datos_modelo_3_limpia
DVJ_Valgus_KneeMedialDisplacement_D_discr,BMI,AgeGroup,ROM-PADF-KE_D,DVJ_Valgus_FPPA_D_discr,TrainFrequency,DVJ_Valgus_FPPA_ND_discr,Asym_SLCMJLanding-pVGRF(10percent)_discr,Asym-ROM-PHIR(≥8)_discr,Asym_TJ_Valgus_FPPA(10percent)_discr,TJ_Valgus_FPPA_ND_discr,Asym-ROM-PHF-KE(≥8)_discr,TJ_Valgus_FPPA_D_discr,Asym_SLCMJ-Height(10percent)_discr,Asym_YBTpl(10percent)_discr,Position,Asym-ROM-PADF-KE(≥8º)_discr,DVJ_Valgus_KneeMedialDisplacement_ND_discr,DVJ_Valgus_Knee-to-ankle-ratio_discr,Asym-ROM-PKF(≥8)_discr,Asym-ROM-PHABD(≥8)_discr,Asym-ROM-PHF-KF(≥8)_discr,Asym-ROM-PHER(≥8)_discr,AsymYBTanterior10percentdiscr,Asym-ROM-PHABD-HF(≥8)_discr,Asym-ROM-PHE(≥8)_discr,Asym(>4cm)-DVJ_Valgus_Knee;edialDisplacement_discr,Asym_SLCMJTakeOff-pVGRF(10percent)_discr,Asym-ROM-PHADD(≥8)_discr,Asym-YBTcomposite(10percent)_discr,Asym_SingleHop(10percent)_discr,Asym_YBTpm(10percent)_discr,Asym_DVJ_Valgus_FPPA(10percent)_discr,Asym_SLCMJ-pLFT(10percent)_discr,DominantLeg,Asym-ROM-PADF-KF(≥8)_discr,ROM-PHER_ND,CPRDmentalskills,POMStension,STAI-R,ROM-PHER_D,ROM-PHIR_D,ROM-PADF-KF_ND,ROM-PADF-KF_D,Age_at_PHV,ROM-PHIR_ND,CPRDtcohesion,Eperience,ROM-PHABD-HF_D,MaturityOffset,Weight,ROM-PHADD_ND,Height,ROM-PHADD_D,Age,POMSdepressio,ROM-PADF-KE_ND,POMSanger,YBTanterior_Dnorm,YBTanterior_NDnorm,POMSvigour,Soft-Tissue_injury_≥4days
So I want to put "#attribute" before each attribute and change each "," to "\n", but I don't know how to do it. I tried to write a function to replace the "," but it didn't work. Any ideas?
Thank you guys.
Try the liac-arff library.
Here is an example for converting the UCI iris dataset from ARFF to CSV and then back to ARFF:
import csv
import arff
# arff -> csv
# Load inside a `with` block so the ARFF file handle is closed afterwards.
with open('./iris.arff', 'r') as fp:
    content = arff.load(fp)
# The csv module expects files opened with newline='' so it can control line
# endings itself (otherwise blank rows can appear on some platforms).
with open('./out.csv', 'w', newline='') as fp:
    writer = csv.writer(fp)
    # Each attribute is a (name, type) pair; only the name goes in the header.
    header = [n for n, t in content['attributes']]
    writer.writerow(header)
    writer.writerows(content['data'])
# csv -> arff
with open('./out.csv', 'r', newline='') as fp:
    reader = csv.reader(fp)
    header = next(reader)  # first row holds the attribute names
    data = list(reader)    # remaining rows are the data
content = {}
content['relation'] = "from my csv file"
# The nominal values of the "class" attribute have to be listed explicitly;
# every other column is declared NUMERIC.
content['attributes'] = [
    (n, ['Iris-setosa', 'Iris-versicolor', 'Iris-virginica']) if n == "class"
    else (n, 'NUMERIC')
    for n in header
]
content['data'] = data
with open('./out.arff', 'w') as fp:
    arff.dump(content, fp)
NB: For the last stage, we need to specify the nominal class values, which you could determine by scanning the data.

Why can't I split files when generating some TFrecord files?

Why can't I split files when generating some TFrecords files?
I'm working on predicting protein structures. As you may know, one protein molecule might have several strands, so I need to split the list of atoms into different TFRecords by strand name.
The problem is, this code ended up by generating several TFrecords with nothing written. All blank.
Or, is there a method to split the strands while training my module? Then I could ignore this problem and put the strand name in the TFrecords as a feature.
'''
with all module imported and no errors raised
'''
# Writes serialized tf.train.Example records for positions
# intPosition..endPosition-1 into '<path><strand name>.tfrecord'.
# NOTE(review): the original indentation is lost in this listing, so the exact
# nesting of the statements below cannot be confirmed from here.
def generate_TFrecord(intPosition, endPosition, path):
# NOTE(review): `x` is not defined anywhere in this block - confirm where the
# current strand name is supposed to come from.
CrtS = x #x is the name of the current strand
path = path + CrtS
writer = tf.io.TFRecordWriter('%s.tfrecord' %path)
# The range always runs up to endPosition; nothing active in this block stops
# at the end of the current strand (the boundary check further down sits
# inside a string literal and never executes).
for i in range(intPosition, endPosition):
if identifyCoreCarbon(i):
# NOTE(review): `vectros` is assigned but not used in the visible code.
vectros = getVectors(i)
# As shown, `features` is still an empty dict when it is passed to
# tf.train.Features; the code that fills it is elided in the string below.
features = {}
'''
feeding this dict
'''
tf_features = tf.train.Features(feature = features)
tf_example = tf.train.Example(features = tf_features)
tf_serialized = tf_example.SerializeToString()
writer.write(tf_serialized)
'''
if checkStrand(i) == False:
writer.write(tf_serialized)
intPosition = i
'''
writer.close()
'''
strand_index is a list of all the startpoint of a single strand
'''
# Calls generate_TFrecord once per strand start index. Every call receives the
# same endPosition and the same path; only the start position changes.
for loop in strand_index:
generate_TFrecord(loop, endPosition, path)
'''
________division___________
The code below works, but it only generates a single tfrecord containing all the atom information.
writer = tf.io.TFRecordWriter('%s.tfrecord' %path)
for i in range(0, endPosition):
if identifyCoreCarbon(i):
vectros = getVectors(i)
features = {}
'''
feeing features
'''
tf_features = tf.train.Features(feature = features)
tf_example = tf.train.Example(features = tf_features)
tf_serialized = tf_example.SerializeToString()
writer.write(tf_serialized)
writer.close()
'''

PdfMiner: Erro processing the page literal required: /b'begin'

I am trying to read a .pdf file using Python 3 with the pdfminer package. This mostly works, but for some pages, while reading the page with interpreter.process_page in getAllPages() of the following code, I get errors like these:
error processing the page literal required: /b'begin'.
error processing the page Unknown operator: 'Qq'.
This is happening only for few docs but not able to find out what is the problem , in which case this could happen?
Code:-
# Reads a PDF through pdfminer and exposes its text grouped per page and per
# visual line.
# NOTE(review): the original indentation is lost in this listing, so the exact
# nesting of statements (notably in create_page_table_modified) cannot be
# confirmed from here.
class PDFDoc():
def __init__(self):
self.rsrcmgr = PDFResourceManager()
self.laparams = LAParams()
# PDFPageDetailedAggregator is not defined in this listing; presumably a
# custom layout device that collects rows in its `rows` attribute - confirm.
self.device = PDFPageDetailedAggregator(self.rsrcmgr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.doc_values = []
self.total_no_of_pages = 0
self.doc_page_dict = collections.OrderedDict()
# self.doc = None
"""
Read PDF Document
"""
def readDoc(self, doc_name):
# NOTE(review): the file is opened here and never closed in this class.
fp = open(doc_name, 'rb')
self.parser = PDFParser(fp)
self.doc = PDFDocument(self.parser)
"""
Read all pages in the document and saved in List of tuples format.
It contains the text and their coordinate info along with page number
"""
def getAllPages(self):
# Requires readDoc() to have been called first, since it reads self.doc.
for page in PDFPage.create_pages(self.doc):
self.interpreter.process_page(page)
# receive the LTPage object for this page
self.device.get_result()
self.doc_values = self.device.rows
"""
Get the total number of pages
"""
def getTotalPages(self):
# max() over the dict iterates its keys; it raises ValueError while
# doc_page_dict is still empty. Presumably keys are 0-based page numbers,
# hence the +1 - confirm.
self.total_no_of_pages = max(self.doc_page_dict)+1
"""
Convert the document info into Page-wise dict. {Key:Value}-->{Page no:[Page text, coordinates]}
"""
def getPageDict(self):
# Each doc_values entry is indexed as: [0] page key, [1] left, [2] bottom,
# [-1] content.
for i in range(len(self.doc_values)):
left = self.doc_values[i][1]
bottom = self.doc_values[i][2]
content = self.doc_values[i][-1]
if self.doc_page_dict.get(self.doc_values[i][0]):
self.doc_page_dict[self.doc_values[i][0]].append({'left':left, 'bottom':bottom, 'content':content})
else:
self.doc_page_dict[self.doc_values[i][0]]=[{'left':left, 'bottom':bottom, 'content':content}]
"""
Align the page text in case they are misaligned
"""
def create_page_table_modified(self, pagedict_list):
# Groups the entries of one page into rows: entries whose integer 'bottom'
# values differ by at most 6 are put in the same row. Returns the rows ordered
# by descending 'bottom' of each row's first entry.
# ##print(pagedict_list)
page_dict = collections.OrderedDict()
page_table_1 = []
# page_table and count are filled below but are not part of the return value.
page_table = []
# exc_arr holds the entries that were already assigned to a row.
exc_arr = []
count = 0
for line in pagedict_list:
row = []
temp_key = float(line['bottom'])
# Entries with empty content are never added to a row.
if not line in exc_arr and line["content"]:
row.append(line)
exc_arr.append(line)
for line_1 in pagedict_list:
if not line_1 in exc_arr and line_1["content"]:
# #print('last_top:', last_top, each_dict_adjusted['bottom'])
if abs(int(line["bottom"]) - int(line_1["bottom"])) <= 6:
row.append(line_1)
exc_arr.append(line_1)
if row:
# Rows are keyed by the float 'bottom' of their first entry; a later row with
# the same key would replace the earlier one in page_dict.
page_dict[temp_key] = row
page_table.append(row)
count += 1
# ##print("\n\nPage:",page_table)
page_dict_keys = sorted(page_dict, reverse=True)
for i in page_dict_keys:
# i = sorted(i, key=lambda k: k['left'])
page_table_1.append(page_dict[i])
return page_table_1
"""
Sort the line elements based on its position coordinates
"""
def sortRowElements(self,row_list):
# Returns a new list ordered by ascending 'left'.
return sorted(row_list, key=lambda k:k['left'])
"""
Combine line elements to form the line text
"""
def combineText(self, row):
# Joins the 'content' of every entry in the row with single spaces.
temp_ = []
# for i in range(len(row)):
text = [k['content'] for k in row]
temp_.append(' '.join(text))
return ' '.join(temp_)
"""
To call aligning and sorting functions
"""
def sortText(self):
# Replaces each page's flat entry list with a list of rows, each row sorted by
# 'left'.
for page in self.doc_page_dict:
self.doc_page_dict[page] = self.create_page_table_modified(self.doc_page_dict[page])
self.doc_page_dict[page] = [self.sortRowElements(line) for line in self.doc_page_dict[page]]
"""
To get text from particular page of the document --> List of line text
"""
def pageText(self, page_no):
# Applies combineText to every item stored for the page; this expects the
# row structure that sortText() produces.
page_text = [self.combineText(line) for line in self.doc_page_dict[page_no]]
return page_text
# Creates the reader object; no document is opened until readDoc() is called.
read_document = PDFDoc()

Properly using dataclasses to return values of items

The project is to sort items into boxes using a particular algorithm. After assigning each item to the proper class, I am having trouble passing the objects to another function and then using and modifying the data held in each dataclass object.
My testing file looks like this:
17 10 4
Abacus 3
Blender 5
Chessboard 3
Dishes 6
My classes:
from dataclasses import dataclass


# The decorator must be applied for real: as a "#dataclass" comment the class
# gets no generated __init__, so InventoryItem(name, weight) raises TypeError.
@dataclass
class InventoryItem:
    """An item to be packed, described by its name and weight."""

    name: str
    weight: float
from dataclasses import dataclass, field


# The decorator must be applied for real: as a "#dataclass" comment the class
# gets no generated __init__, so BoxInventory(name, max, remaining) raises
# TypeError.
@dataclass
class BoxInventory:
    """A box with a weight limit, its remaining capacity and its contents."""

    name: str
    maxWeight: float
    remainingWeight: float
    # Annotated as dict, so default to a fresh empty dict per instance rather
    # than the empty string the field used to start with.
    contents: dict = field(default_factory=dict)

    # Draft methods from the original, kept disabled:
    #
    # def listContents(self, contents):
    #     self.listContents = contents
    # def remainingWeight(self, remainingWeight):
    #     self.remainingWeight = remainingWeight
    # def addItemWeight(self, itemWeight):
    #     self.remainingWeight -= itemWeight
    # def addItemList(self, itemName, itemWeight, contents):
    #     self.contents = contents[itemName] = contents[itemWeight]
Here is where I read my text file and transfer it to a class:
"""
Take the given txt file and format into proper list for boxes and items
:param filename: The filename of the text file
:return: Send lists to to be used by an algo.
"""
# NOTE(review): the enclosing `def` line and the original indentation are
# missing from this listing; the notes below describe the statements as shown.
with open(filename, 'r') as myFile: # Open the correct file
itemDict = {}
boxDict = {}
# Every line of the file becomes a list of whitespace-separated tokens.
myList = [line.split() for line in myFile.readlines()]
# The first line is treated as the box line; all later lines as items.
boxLine = ' '.join(myList[0])
for line in range(1, len(myList)):
# The tokens are concatenated without a separator, then the last character is
# taken as the weight and everything before it as the name.
# NOTE(review): this looks like it only works for single-digit weights -
# confirm.
lines = ''.join(myList[line])
itemName = lines[:-1]
weight = lines[len(lines) - 1:]
item = InventoryItem(itemName, int(weight))
# Each item is stored wrapped in a one-element list.
itemDict[itemName] = [item]
boxString = ""
count = 0
# Builds box capacities character by character; a box is created only when a
# space is reached.
# NOTE(review): as written, the digits after the last space appear never to be
# turned into a box - confirm.
for char in boxLine:
if char != " ":
boxString = boxString + char
else:
boxName = "Box" + str(count)
box = BoxInventory(boxName, int(boxString), int(boxString))
boxDict[boxName] = [box]
boxString = ""
count += 1
# Both dictionaries are returned together under the keys 'boxDict' and
# 'itemDict'.
myReturn = {}
myReturn['boxDict'] = boxDict
myReturn['itemDict'] = itemDict
return myReturn
Non-implemented algorithm:
def roomiest(myReturnDict):
    """
    Placeholder for the "roomiest box" packing strategy: for each item, pick
    the box with the greatest remaining allowed weight that can still hold it.

    Only the input lookup exists so far; nothing is computed or returned.

    :param myReturnDict: Mapping read for the keys "itemDict" and "boxDict"
    :return: None
    """
    # Missing keys yield None rather than raising.
    itemList, boxList = (
        myReturnDict.get(key) for key in ("itemDict", "boxDict")
    )
My problem is that I do not know how to read the parsed data from my
fileReader function in my algorithm function.
Your input function is a little strange as you're storing the objects in a list of length 1 inside a dictionary. So your data looks like:
'Dishes': [InventoryItem(name='Dishes', weight=6)]
instead of
'Dishes': InventoryItem(name='Dishes', weight=6)
You might have a reason for it, but changing itemDict[itemName] = [item] to itemDict[itemName] = item makes your code a little easier to follow (and the same for boxDict[boxName] = [box]). With that change you can access the parsed data easily with the following:
for item_name, item in itemList.items():
print(item.name)
print(item.weight)
This iterates through the itemList dictionary, getting the key, value pairs which in this case is itemName, item (or [item] in your original code. If you don't want to change that, replace item with item[0] in the code above). Then you can access attributes of your Class directly by calling their label.
You can get the box with most space remaining, using
sorted_box_list = (sorted(boxList.values(), key=operator.attrgetter('remainingWeight'), reverse=True))
What I have done is that, rather than using a dictionary, I am using a list to pass the data on to a new function.
Text File --> List --> Dict --> List --> sortedList
Here is my new fileReader function:
def fileReader(filename):
    """
    Take the given txt file and format into proper list for boxes and items

    The first line holds the box capacities separated by whitespace; every
    following non-blank line holds an item name followed by its weight.

    :param filename: The filename of the text file
    :return: Dict with the keys 'boxList' (BoxInventory objects named Box0,
        Box1, ... in file order) and 'itemList' (InventoryItem objects in file
        order), to be used by an algo.
    """
    with open(filename, 'r') as myFile:  # Open the correct file
        myList = [line.split() for line in myFile]

    itemList = []
    boxList = []

    # Item lines: the last token is the weight, the tokens before it form the
    # name (joined without a separator, as before). This also handles weights
    # with more than one digit, which the last-character slicing did not.
    for tokens in myList[1:]:
        if not tokens:
            continue  # ignore blank lines
        if len(tokens) == 1:
            # No separator on the line: keep the old convention that the last
            # character is the weight.
            itemName, weight = tokens[0][:-1], tokens[0][-1:]
        else:
            itemName, weight = ''.join(tokens[:-1]), tokens[-1]
        itemList.append(InventoryItem(itemName, int(weight)))

    # Box line: one box per number. Walking the tokens directly also creates
    # the last box, which the character loop dropped because no space follows
    # the final number.
    if myList:
        for count, capacity in enumerate(myList[0]):
            boxName = "Box" + str(count)
            boxList.append(BoxInventory(boxName, int(capacity), int(capacity)))

    # roomiest() looks the lists up under these two keys.
    myReturn = {}
    myReturn['boxList'] = boxList
    myReturn['itemList'] = itemList
    return myReturn
I then read and sort the data in each algorithm using this same method:
def roomiest(myReturnDict):
    """
    Prepare the data for the "roomiest box" strategy: for each item find the
    box with the greatest remaining allowed weight that can support the item.

    Only the sorting step exists so far; no item is placed in a box yet.

    :param myReturnDict: Mapping with the keys "itemList" (objects with a
        ``weight`` attribute) and "boxList" (objects with a
        ``remainingWeight`` attribute); a missing key counts as an empty list
    :return: Dict with 'boxList' sorted by descending remainingWeight and
        'itemList' sorted by descending weight
    """
    # `or []` keeps list() from failing with TypeError when a key is missing.
    itemData = list(myReturnDict.get("itemList") or [])
    boxData = list(myReturnDict.get("boxList") or [])
    sortedItemList = sorted(itemData, key=lambda x: x.weight, reverse=True)
    sortedBoxList = sorted(boxData, key=lambda x: x.remainingWeight, reverse=True)
    # The previous code stored the undefined names `boxList` / `itemList`
    # here, which raised NameError. The sorted lists are returned instead.
    # NOTE(review): confirm this is the intended return value.
    myReturn = {}
    myReturn['boxList'] = sortedBoxList
    myReturn['itemList'] = sortedItemList
    return myReturn
My dataclasses look like the following:
from dataclasses import dataclass


# The decorator must be applied for real: as a "#dataclass" comment the class
# gets no generated __init__, so InventoryItem(name, weight) raises TypeError.
@dataclass
class InventoryItem:
    """An item to be packed, described by its name and weight."""

    name: str
    weight: float
from dataclasses import dataclass, field


# The decorator must be applied for real: as a "#dataclass" comment the class
# gets no generated __init__, so BoxInventory(name, max, remaining) raises
# TypeError.
@dataclass
class BoxInventory:
    """A box with a weight limit, its remaining capacity and its contents."""

    name: str
    maxWeight: float
    remainingWeight: float
    # Annotated as dict, so default to a fresh empty dict per instance rather
    # than the empty string the field used to start with.
    contents: dict = field(default_factory=dict)
def itemWeight(item):
    """Print the name and weight of ``item`` and return the weight."""
    weight = item.weight
    print("Weight of", item.name, "is: ", weight, "\n")
    return weight
def remainWeight(box):
    """Print the name and remaining weight of ``box`` and return that weight."""
    remaining = box.remainingWeight
    print("Rem. weight in ", box.name, "is: ", remaining, "\n")
    return remaining

Resources