Remove punctuation from list - string

I am working on setting up some usable data for semantic analysis. I have a corpus of raw text data that I am iterating over. I open the data, read it as a string, split into a list, and prepare the data to be built into a dataset in a later function. However, when I build the dataset, my most common words end up being punctuation. I need to remove all punctuation from the list before I process the data further.
import os
import collections
import string
import sys
import tensorflow as tf
import numpy as np
from six.moves import xrange
totalvocab = []
#Loop for: loop through all files in 'Data' directory
for subdir, dirs, files in os.walk('Data'):
    for file in files:
        filepath = subdir + os.sep + file
        print(filepath)
        #Function for: open file, convert input to string, split into list
        def read_data(filepath):
            with open(filepath, 'r') as f:
                data = tf.compat.as_str(f.read()).split()
            return data
        #Run function on data, add file data to full data set.
        filevocab = read_data(filepath)
        totalvocab.extend(filevocab)
        filevocab_size = len(filevocab)
        print('File vocabulary size: %s' % filevocab_size)

totalvocab_size = len(totalvocab)
print('Total vocabulary size: %s' % totalvocab_size)
If I do the following:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
        data.translate(string.punctuation)
        data.split()
    return data
The words are split into individual letters.
Any other methods I have attempted have errored out.

There are a couple of errors in the code:
str.split() and str.translate() do not modify the string in place; strings are immutable, so the results need to be assigned back.
In Python 3, str.translate() expects a translation table (a mapping of character ordinals), not a string of characters; build one with str.maketrans().
To fix:
def read_data(filepath):
    with open(filepath, 'r') as f:
        data = tf.compat.as_str(f.read())
    data = data.translate(str.maketrans('', '', string.punctuation))
    return data.split()
Removing punctuation may or may not do what you want; e.g. hyphenated words will become concatenated. You could alternatively identify punctuation that you would replace with a space.
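For instance, a minimal sketch that maps every punctuation character to a space before splitting (which characters to delete versus replace with a space is an assumption you should adapt to your corpus):

import string

def read_data(filepath):
    with open(filepath, 'r') as f:
        data = f.read()
    # Map each punctuation character to a space, then split:
    # 'co-operate' becomes ['co', 'operate'] instead of 'cooperate'.
    table = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
    return data.translate(table).split()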


with open(os.path.join(directory, filename), 'r','utf-8') as file: TypeError: 'str' object cannot be interpreted as an integer

Why am I getting this error?
import os
import string
from nltk.corpus import stopwords

# Get a list of English stop words
stop_words = stopwords.words('english')

# Set the input and output directories
input_dir = 'C:\\Users\\acer\\OneDrive\\Desktop\\extracted_data'
output_dir = 'C:\\Users\\acer\\OneDrive\\Desktop\\cleaned extracted data'

# Iterate over the files in the input directory
for filename in os.listdir(input_dir):
    # Read the text file
    with open(os.path.join(input_dir, filename), 'r','utf-8') as file:
        text = file.read()
    # Split the text into a list of words
    words = text.split()
    # Remove punctuation from each word
    words = [word.translate(str.maketrans('', '', string.punctuation)) for word in words]
    # Remove stop words from the list of words
    cleaned_words = [word for word in words if word.lower() not in stop_words]
    # Join the cleaned words into a single string
    cleaned_text = ' '.join(cleaned_words)
    # Write the cleaned text to a new file
    with open(os.path.join(output_dir, filename), 'w',"utf-8") as file:
        file.write(cleaned_text)
This is my code. I was cleaning data extracted from a website using stop words when this error suddenly occurred.
The error occurs because the third positional argument of the built-in open() is buffering, which expects an integer, not the string 'utf-8'. I solved the error by importing codecs. If you want to open a file in read mode and specify the encoding, you can use the codecs module like this:
import codecs
with codecs.open(os.path.join(directory, filename), 'r', encoding='utf-8') as file:
    text = file.read()
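Alternatively, the built-in open() accepts the encoding as a keyword argument, so the loop from the question needs only its two open() calls changed. A minimal sketch, with the cleaning steps elided:

for filename in os.listdir(input_dir):
    # encoding must be passed by keyword; the third positional
    # parameter of open() is buffering (an integer)
    with open(os.path.join(input_dir, filename), 'r', encoding='utf-8') as file:
        text = file.read()
    # ... cleaning steps unchanged ...
    with open(os.path.join(output_dir, filename), 'w', encoding='utf-8') as file:
        file.write(cleaned_text)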

Searching a list of words from a text file and printing the first three lines using Python

I have a text file in which I have to find particular headings and read the first few lines under each heading. I was able to do it for one heading, but I face an issue when doing multiple headings.
I did it successfully for one heading:
Data = ['work:']
But I was not able to do it for this scenario:
Data = ['work:', 'test:', 'ride:']
In the text file the data is like below:
work:
'500'
'ast'
'800'
test:
'tim'
'200'
'300'
ride:
'mic'
'100'
'657'
import math
import os
import glob
import re
import sys
sys.path.append('C:/Documents/tes')
def read(file, Data, res, outputline):
    with open(file, 'r') as f:
        stc_file = os.path.basename(file)
        for line in f:
            if Data in line:
                line = f.readlines()
                return line[outputline]

fls = []
src_dir = r'C:/Documents/tes'
for root, dirs, files in os.walk(src_dir):
    for filename in files:
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(root, filename)
        fls.append(filepath)

result = []
Data = ['work:','test:','ride:']
for file in fls:
    result = read(file, Data, result, 0).split() + read(file, Data, result, 1).split() + read(file, Data, result, 2).split()
The above code works for one heading, but I was not able to make it work for multiple headings.
['500','ast','800']
['tim','200','300']
['mic','100','657']
The above is the expected output.
This script will do what you asked, provided each of the three lines of data you are looking for is surrounded by single quotes (I am not sure if you wanted more, or an arbitrary number), and provided I understood your goal correctly...
import os

src_dir = os.getcwd() # or whatever path you want
keywords = ['work:', 'test:', 'ride:']
result = []
for root, dirs, files in os.walk(src_dir):
    for filename in files:
        if filename.endswith('.txt'):
            path = os.path.join(root, filename)
            try:
                with open(path, 'r') as fh:
                    lines = [l.strip("'") for l in fh.read().splitlines()]
                for i in range(len(lines)):
                    if lines[i] in keywords:
                        result.append(' '.join(lines[i+1:i+4]).split())
            except Exception as e:
                print('Something went wrong.')
                print(e)
                continue
print(result)
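Run against the sample file from the question, this prints the groups as nested lists, matching the expected output:

[['500', 'ast', '800'], ['tim', '200', '300'], ['mic', '100', '657']]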

Writing more than one file while auto-incrementing the file name failed (Python 3.6)

I'm using PyPDF2 to crop PDF files in a directory. There are 2 of them. My goal is to crop both files and save the new cropped files as "article_0.pdf" and "article_1.pdf":
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
directory=os.getcwd()
for file in os.listdir(directory):
    reader = PdfFileReader(file,'r')
    writer = PdfFileWriter()
    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        page.cropBox.setUpperLeft((0,720))
        page.cropBox.setLowerLeft((0,70))
        page.cropBox.setLowerRight((612,70))
        page.cropBox.setUpperRight((612,720))
        writer.addPage(page)
    for index in range(2):
        filename = 'article_%d.pdf' % index
        with open(filename,'wb') as f:
            writer.write(f)
The cropping is done properly and the output files are named "article_0.pdf" and "article_1.pdf" as expected. However, both output files are the cropped version of only one of the original files. Why is the cropped version of the other file not in the output files? Thanks!
UPDATE: I tried fixing the indents like this but still got the same results:
from PyPDF2 import PdfFileReader, PdfFileWriter
import os
directory=os.getcwd()
for file in os.listdir(directory):
    reader = PdfFileReader(file,'r')
    writer = PdfFileWriter()
    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        page.cropBox.setUpperLeft((0,720))
        page.cropBox.setLowerLeft((0,70))
        page.cropBox.setLowerRight((612,70))
        page.cropBox.setUpperRight((612,720))
        writer.addPage(page)

for index in range(2):
    filename = 'article_%d.pdf' % index
    with open(filename,'wb') as f:
        writer.write(f)
The loop you're using to write your two files doesn't do anything useful. It always writes two files with the same contents, because the contents of writer don't change during the loop.
Instead, you should be writing once per iteration of the first loop. To get an index, use enumerate to get an index along with the file name when you call os.listdir:
for index, file in enumerate(os.listdir(directory)): # compute index up here with enumerate
    reader = PdfFileReader(file,'r')
    writer = PdfFileWriter()
    for i in range(reader.getNumPages()):
        ...
    filename = 'article_%d.pdf' % index # no extra loop is needed down here
    with open(filename,'wb') as f:
        writer.write(f)
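Put together, a minimal sketch of the whole corrected script, using the same crop box values as in the question. One addition of mine: it skips non-PDF directory entries, which os.listdir would otherwise hand to PdfFileReader:

from PyPDF2 import PdfFileReader, PdfFileWriter
import os

directory = os.getcwd()
pdfs = [f for f in os.listdir(directory) if f.endswith('.pdf')]
for index, file in enumerate(pdfs):
    reader = PdfFileReader(file)
    writer = PdfFileWriter()
    for i in range(reader.getNumPages()):
        page = reader.getPage(i)
        page.cropBox.setUpperLeft((0, 720))
        page.cropBox.setLowerLeft((0, 70))
        page.cropBox.setLowerRight((612, 70))
        page.cropBox.setUpperRight((612, 720))
        writer.addPage(page)
    # Write one cropped output per input file, inside the loop.
    with open('article_%d.pdf' % index, 'wb') as f:
        writer.write(f)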

Exporting Python to Excel is only showing one row of data [duplicate]

I have data which is being accessed via HTTP request and is sent back by the server in a comma-separated format. I have the following code:
import urllib2
from bs4 import BeautifulSoup

site = 'www.example.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site, headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
soup = soup.get_text()
text = str(soup)
The content of text is as follows:
april,2,5,7
may,3,5,8
june,4,7,3
july,5,6,9
How can I save this data into a CSV file?
I know I can do something along the lines of the following to iterate line by line:
import StringIO
s = StringIO.StringIO(text)
for line in s:
But I'm unsure how to properly write each line to the CSV.
EDIT: Thanks for the feedback; as suggested, the solution was rather simple and can be seen below.
Solution:
import StringIO
s = StringIO.StringIO(text)
with open('fileName.csv', 'w') as f:
    for line in s:
        f.write(line)
General way:
## text = list of strings to be written to file
with open('csvfile.csv','wb') as file:
    for line in text:
        file.write(line)
        file.write('\n')
OR
Using the CSV writer:
import csv
with open(<path to output_csv>, "wb") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for line in data:
        # writerow expects a sequence of fields; a bare string would be
        # written one character per column, so split the line first
        writer.writerow(line.split(','))
OR
Simplest way:
f = open('csvfile.csv','w')
f.write('hi there\n') #Give your csv text here.
## Python will convert \n to os.linesep
f.close()
You could just write to the file as you would write any normal file.
with open('csvfile.csv','wb') as file:
    for l in text:  # assuming text is a list of lines
        file.write(l)
        file.write('\n')
If, just in case, it is a list of lists, you could directly use the built-in csv module:
import csv
with open("csvfile.csv", "wb") as file:
    writer = csv.writer(file)
    writer.writerows(text)
I would simply write each line to a file, since it's already in a CSV format:
write_file = "output.csv"
with open(write_file, "wt", encoding="utf-8") as output:
    for line in text:
        output.write(line + '\n')
I can't recall how to write lines with line-breaks at the moment, though :p
Also, you might like to take a look at this answer about write(), writelines(), and '\n'.
To complement the previous answers, I whipped up a quick class to write to CSV files. It makes it easier to manage and close open files and achieve consistency and cleaner code if you have to deal with multiple files.
import csv
import os

class CSVWriter():
    filename = None
    fp = None
    writer = None

    def __init__(self, filename):
        self.filename = filename
        self.fp = open(self.filename, 'w', encoding='utf8')
        self.writer = csv.writer(self.fp, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL, lineterminator='\n')

    def close(self):
        self.fp.close()

    def write(self, elems):
        self.writer.writerow(elems)

    def size(self):
        return os.path.getsize(self.filename)

    def fname(self):
        return self.filename
Example usage:
mycsv = CSVWriter('/tmp/test.csv')
mycsv.write((12,'green','apples'))
mycsv.write((7,'yellow','bananas'))
mycsv.close()
print("Written %d bytes to %s" % (mycsv.size(), mycsv.fname()))
Have fun
What about this:
with open("your_csv_file.csv", "w") as f:
f.write("\n".join(text))
str.join() Return a string which is the concatenation of the strings in iterable.
The separator between elements is
the string providing this method.
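For example, with the first two lines from the question:

>>> "\n".join(['april,2,5,7', 'may,3,5,8'])
'april,2,5,7\nmay,3,5,8'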
In my situation...
with open('UPRN.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(('Name', 'UPRN','ADMIN_AREA','TOWN','STREET','NAME_NUMBER'))
    writer.writerows(lines)
You need to include the newline='' option in the call to open() and it will work.
https://www.programiz.com/python-programming/writing-csv-files

Loop only iterates first instance and how do I extract single filenames via os.walk?

Objective: a script that searches a directory for Axograph files, performs a series of computations, and either returns the values or saves them to a CSV file.
Update: The data comes from Axographio raw files of intracellular recordings, where each file has 4 waves of data. I seek to compute the very basic ephys properties:
input resistance,
half-width at half-maximum,
rheobase,
SAG potential,
membrane time constant,
afterhyperpolarisation.
I found the Python module "stimfit", which has useful libraries for this, but it does not import properly. I'm currently troubleshooting this package.
This is my first script and first recordings, so if anyone has ideas, please share.
Problem 1. "f = axographio.read(filename)" only takes one argument with a string specific to "filename.axgd". I cant put this method alone since the resulting "file" is a list object and not the single instance of each filenames that I seek. I think that above function can work with "f=axographio.read(file)" if I can get that file object as a list or array of the filenames and extract them.
Problem 1 is now solved with f = axographio.read(os.path.join(root, file)) - it was unclear that os.path worked within the axographio module.
Problem 2. I can print all my files as list of strings but unable to extract and open each single file (even with file=open(blablabla.).
Update: I can either loop for each open file or save them to a list or array. I dont mind advice on best practice on this. axograph seems to work well with NumPy array but list is tempting since it has more flexibility. Any advice on this?
Note: its´s not necessary to make this complicated. The key point is to perform computations on multiple files at my choosing. Whether I save this into file or have more or less control is a matter of taste and time.
# -*- coding: utf-8 -*-
"""
#author: Martenzi
"""
import os,sys
import numpy as np
import matplotlib.pylab as plt
import axographio
from scipy import stats
import re
from collections import defaultdict
""" Search a directory for all Axograph files """
for root, dirs, files in os.walk("."):
for file in files:
if file.endswith(".axgd"):
f = axographio.read(os.path.join(root, file))
plt.show(file)
plt.plot(f.data[0],f.data[1])
plt.plot(f.data[0],f.data[2])
plt.plot(f.data[0],f.data[3])
plt.plot(f.data[0],f.data[4])
""" Below is various code snippets that I have tried out. They are not in specific order. I have tried things and stacked it as comments """
# if(line == 'foo'):
# line = next(irofile) #BEWARE, This could raise StopIteration!
# print line
# for i in file:
# cells = [];
# cells.append(file);
# for index, w in enumerate (loopme):
# cells = array ( [i] ,dtype=complex)
# print(file)
# for k in file:
# d = defaultdict(int)
# d[k].append()
# m=self.fileFormatRegex.match(file)
# self.processFile(root, open(os.path.join(r"/", root, file)), age, inString)
# infile = open(cells,"r")
# fullpath = os.path.walk(files)
# infile = open(file, "r")
# f = open(file ["r"][buffering])
# with open(infile, mode='r'):
## f = axographio.read(file)
# print(file)
# f = axographio.read(cell)
# with open(fullpath, 'r') as f:
# f = axographio.read(file)
# data = re.sub(r'(\s*function\s+.*\s*{\s*)',
# r'\1echo "The function starts here."',
# f.read())
# with open(fullpath, 'w') as f:
# f.write(data)
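On the loop-versus-list question, here is a minimal sketch of one approach, under the assumption (taken from the snippet above) that axographio.read returns an object whose .data holds the time base in data[0] and the four waves in data[1:5]: collect each file's traces in a list first, then run computations over that list afterwards.

import os
import axographio
import matplotlib.pylab as plt

recordings = []  # (filepath, traces) pairs, one per .axgd file
for root, dirs, files in os.walk("."):
    for file in files:
        if file.endswith(".axgd"):
            path = os.path.join(root, file)
            recordings.append((path, axographio.read(path).data))

# Computations are now decoupled from the directory walk.
for path, data in recordings:
    print(path)
    for wave in data[1:5]:
        plt.plot(data[0], wave)
    plt.show()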
