Writing to same file(s) with multiprocessing (avoid lock) - python-3.x

I'm running a script on multiple CSV files using multiprocessing.
If a line matches the regex, it writes the line to one or more new files (the new file name equals the match).
I've noticed a problem writing to the same file(s) from different processes (file locking). How can I fix this?
My code:
import re
import glob
import os
import multiprocessing

pattern = 'abc|def|ghi|jkl|mno'
regex = re.compile(pattern, re.IGNORECASE)

def process_files(file):
    res_path = r'd:\results'
    with open(file, 'r+', buffering=1) as ifile:
        for line in ifile:
            matches = set(regex.findall(line))
            for match in matches:
                res_file = os.path.join(res_path, match + '.csv')
                with open(res_file, 'a') as rf:
                    rf.write(line)

def main():
    p = multiprocessing.Pool()
    for file in glob.iglob(r'D:\csv_files\**\*.csv', recursive=True):
        p.apply_async(process_files, [file])
    p.close()
    p.join()

if __name__ == '__main__':
    main()
Thanks in advance!

Make the filename unique for each subprocess:
def process_files(file, id):
    res_path = r'd:\results'
    with open(file, 'r+', buffering=1) as ifile:
        for line in ifile:
            matches = set(regex.findall(line))
            for match in matches:
                filename = "{}_{}.csv".format(match, id)
                res_file = os.path.join(res_path, filename)
                with open(res_file, 'a') as rf:
                    rf.write(line)

def main():
    p = multiprocessing.Pool()
    for id, file in enumerate(glob.iglob(r'D:\csv_files\**\*.csv', recursive=True)):
        p.apply_async(process_files, [file, id])
Then you will have to add some code to consolidate the different "<match>_<id>.csv" files into single "<match>.csv" files once the pool has finished (see the consolidation sketch below).
Concurrent writes to the same file are something you want to avoid: either you don't have file locks and you end up with corrupted data, or you do have file locks, which slows the process down and defeats the whole point of parallelizing it.
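Here is a minimal consolidation sketch, to be run after p.join() returns. It assumes the per-process files follow the "<match>_<id>.csv" naming used above and simply concatenates them into one file per match; the consolidate() helper is my illustration, not part of the original answer.

import glob
import os
import shutil

def consolidate(res_path=r'd:\results'):
    # Group the per-process files by their match prefix,
    # e.g. "abc_0.csv" and "abc_3.csv" are appended into "abc.csv".
    for part in glob.glob(os.path.join(res_path, '*_*.csv')):
        match = os.path.basename(part).rsplit('_', 1)[0]
        target = os.path.join(res_path, match + '.csv')
        with open(part, 'r') as src, open(target, 'a') as out:
            shutil.copyfileobj(src, out)
        os.remove(part)  # drop the partial file once its lines are merged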

Related

How can I make web resources available offline?

There is a folder on my PC with Linux OS which contains a website (web pages etc.). The web pages and other complementary files in the folder use CDNs to load resources like jQuery, DataTables etc.
I want to make these resources available offline. I know I can manually search all files for occurrences of the "http" keyword, download the files from those URLs, keep them in the folder and change the source file paths accordingly. But as there are too many files, this seems troublesome. I want to ask whether there is a better and more elegant way of doing so. Thanks in advance.
I made a python script to do the job:
import re
import os
import aiohttp
import asyncio
import pathlib
import string
import random
import chardet

# Decode a byte sequence using chardet, so files with unknown encodings don't raise a TypeError
def decode_bytes(byte_sequence):
    result = chardet.detect(byte_sequence)
    encoding = result['encoding']
    return byte_sequence.decode(encoding)

VALID_URL_REGEX = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')

# Downloader; I lazily use resp.status == 200 as the success criterion, add more checks if you need them
async def download_file(session, url, local_path):
    async with session.get(url, allow_redirects=True, ssl=False) as resp:
        if resp.status == 200:
            print("Content path is " + str(local_path))
            with open(local_path, "wb") as f:
                while True:
                    chunk = await resp.content.read(4196)
                    if not chunk:
                        break
                    f.write(chunk)

# Map of already-downloaded URLs to their local paths, to avoid re-downloading
downloaded_urls = {}

async def process_file(file_path, session):
    print("File during read " + str(file_path))
    with open(file_path, "rb") as f:
        raw = f.read()
    try:
        contents = decode_bytes(raw)
    except (UnicodeDecodeError, TypeError) as e:
        print(f"Error decoding file {file_path}: {e}")
        return
    urls = re.findall(VALID_URL_REGEX, contents)
    try:
        for url in urls:
            file_name = url.split("/")[-1]
            if len(file_name) == 0:
                continue
            if url in downloaded_urls:
                local_path = downloaded_urls[url]
            else:
                # Prepend a random string so different URLs with the same file name don't collide
                res = ''.join(random.choices(string.ascii_uppercase + string.digits, k=5))
                file_name = res + file_name
                local_path = os.path.join("downloaded", file_name)
                if not os.path.exists(local_path):
                    await download_file(session, url, local_path)
                downloaded_urls[url] = local_path
            contents = contents.replace(url, local_path)
    except Exception:
        pass
    print("File during write " + str(file_path))
    with open(file_path, "w", encoding="utf-8", errors="ignore") as f:
        f.write(contents)

async def process_directory(directory):
    if not os.path.exists("downloaded"):
        os.makedirs("downloaded")
    conn = aiohttp.TCPConnector(limit=2200, limit_per_host=20, ttl_dns_cache=22)
    async with aiohttp.ClientSession(connector=conn) as session:
        tasks = []
        try:
            for filepath in pathlib.Path(directory).glob('**/*'):
                fp = filepath.absolute()
                if str(fp).endswith(".md") or str(fp).endswith(".txt"):
                    continue
                if os.path.isfile(fp):
                    tasks.append(process_file(fp, session))
        except Exception:
            pass
        await asyncio.gather(*tasks)

if __name__ == '__main__':
    directory = input("Enter root directory: ")
    asyncio.run(process_directory(directory))
I will also try a "substitution" approach and update the answer accordingly.
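If that "substitution" idea means swapping the contents.replace() calls for a single regex substitution, a minimal sketch could look like the following. This is my interpretation, not part of the original answer; url_to_local is assumed to be a dict mapping each downloaded URL to its local path (as downloaded_urls is above).

import re

def rewrite_urls(contents, url_to_local):
    # Replace every known URL in one pass instead of calling str.replace() per URL.
    if not url_to_local:
        return contents
    pattern = re.compile('|'.join(re.escape(u) for u in url_to_local))
    return pattern.sub(lambda m: url_to_local[m.group(0)], contents)

# Example (hypothetical paths):
# rewrite_urls('<script src="https://cdn.example/jquery.js"></script>',
#              {'https://cdn.example/jquery.js': 'downloaded/AB12Cjquery.js'})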

How to determine the number of columns per row variably in a CSV File with Python?

I am analyzing XML-structured text files about insider dealings. I wrote some code to parse the XML structure and write my output to a CSV file. Each analyzed file is written as one row, with the extracted information in individual columns. But in some files a piece of information is present multiple times, and my code overwrites the information in the cells, so in the end only one date is left in the cell of my CSV file.
import csv
import glob
import re
import string
import time
import bs4 as bs

# User defined directory for files to be parsed
TARGET_FILES = r'D:\files\*'  # glob pattern; a raw string cannot end with a single backslash
# User defined file pointer to LM dictionary
# User defined output file
OUTPUT_FILE = r'D:\ouput\Parser.csv'
# Setup output
OUTPUT_FIELDS = [r'Datei', 'transactionDate', r'transactionsCode', r'Director', r'Officer', r'Titel', r'10-% Eigner', r'sonstiges', r'SignatureDate']

def main():
    f_out = open(OUTPUT_FILE, 'w')
    wr = csv.writer(f_out, lineterminator='\n', delimiter=';')
    wr.writerow(OUTPUT_FIELDS)
    file_list = glob.glob(TARGET_FILES)
    for file in file_list:
        print(file)
        with open(file, 'r', encoding='UTF-8', errors='ignore') as f_in:
            soup = bs.BeautifulSoup(f_in, 'xml')
            output_data = get_data(soup)
            output_data[0] = file
            wr.writerow(output_data)

def get_data(soup):
    # overwrites the transactionDate if more than one transaction is disclosed on the current form
    # the index determines the column for the output
    _odata = [0] * 9
    try:
        for item in soup.find_all('transactionDate'):
            _odata[1] = item.find('value').text
    except AttributeError:
        _odata[1] = 'keine Angabe'
    try:
        for item in soup.find_all('transactionAcquiredDisposedCode'):
            _odata[2] = item.find('value').text
    except AttributeError:
        _odata[2] = 'ka'
    for item in soup.find_all('reportingOwnerRelationship'):
        try:
            _odata[3] = item.find('isDirector').text
        except AttributeError:
            _odata[3] = 'ka'
        try:
            _odata[4] = item.find('isOfficer').text
        except AttributeError:
            _odata[4] = 'ka'
        try:
            _odata[5] = item.find('officerTitle').text
        except AttributeError:
            _odata[5] = 'ka'
        try:
            _odata[6] = item.find('isTenPercentOwner').text
        except AttributeError:
            _odata[6] = 'ka'
        try:
            _odata[7] = item.find('isOther').text
        except AttributeError:
            _odata[7] = 'ka'
    try:
        for item in soup.find_all('ownerSignature'):
            _odata[8] = item.find('signatureDate').text
    except AttributeError:
        _odata[8] = 'ka'
    return _odata

if __name__ == '__main__':
    print('\n' + time.strftime('%c') + '\nGeneric_Parser.py\n')
    main()
    print('\n' + time.strftime('%c') + '\nNormal termination.')
Actually the code works, but it overwrites columns if, for example, more than one transaction date is given in the file. So I need code that automatically uses the next column for each additional transaction date. How could this work?
I would be glad if someone has a solution for my problem. Thanks a lot!
Your issue is that you are iterating over the result of soup.find_all() and writing to the same _odata slot every time. You need to do something with _odata in each iteration, otherwise you will only end up with whatever was written to it in the last iteration.
If you can show us what the data you're trying to parse actually looks like, perhaps we could give a more specific answer.
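A minimal sketch of one way to do it, assuming the goal is to append one extra column per transactionDate found (this is my illustration, not code from the question): collect all dates into a list and extend the row instead of overwriting a single index.

def get_transaction_dates(soup):
    # Collect every transactionDate value instead of keeping only the last one.
    dates = []
    for item in soup.find_all('transactionDate'):
        value = item.find('value')
        dates.append(value.text if value is not None else 'keine Angabe')
    return dates

# In main(), the row then grows by one column per date:
#   output_data = get_data(soup)
#   output_data[0] = file
#   wr.writerow(output_data + get_transaction_dates(soup))

Note that the extra columns will no longer line up with the fixed OUTPUT_FIELDS header, so you may want to widen the header or pad shorter rows.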

Searching a list of words from textfile and printing first three lines using python

I have a text file in which I have to find particular headings and read the first lines under each heading. I was able to do it for one heading, but I am facing an issue when doing it for multiple headings.
It works for a single heading:
Data = ['work:']
I was not able to make it work for this scenario:
Data = ['work:', 'test:', 'ride:']
In the text file, the data looks like this:
work:
'500'
'ast'
'800'
test:
'tim'
'200'
'300'
ride:
'mic'
'100'
'657'
import math
import os
import glob
import re
import sys

sys.path.append('C:/Documents/tes')

def read(file, Data, res, outputline):
    with open(file, 'r') as f:
        stc_file = os.path.basename(file)
        for line in f:
            if Data in line:
                line = f.readlines()
                return line[outputline]

fls = []
src_dir = r'C:/Documents/tes'
for root, dirs, files in os.walk(src_dir):
    for filename in files:
        if not filename.endswith('.txt'):
            continue
        filepath = os.path.join(root, filename)
        fls.append(filepath)

result = []
Data = ['work:', 'test:', 'ride:']
for file in fls:
    result = read(file, Data, result, 0).split() + read(file, Data, result, 1).split() + read(file, Data, result, 2).split()
The above code works for one heading, but for multiple headings I was not able to make it work.
['500','ast','800']
['tim','200','300']
['mic','100','657']
The above is the expected output.
This script will do what you asked, if each of the three lines of data you are looking for (not sure if you wanted more, or an arbitrary number?) is surrounded by single quotes, and if I understood your goal correctly...
import os

src_dir = os.getcwd()  # or whatever path you want
keywords = ['work:', 'test:', 'ride:']
result = []
for root, dirs, files in os.walk(src_dir):
    for filename in files:
        if filename.endswith('.txt'):
            path = os.path.join(root, filename)
            try:
                with open(path, 'r') as fh:
                    lines = [l.strip("'") for l in fh.read().splitlines()]
                for i in range(len(lines)):
                    if lines[i] in keywords:
                        result.append(' '.join(lines[i+1:i+4]).split())
            except Exception as e:
                print('Something went wrong.')
                print(e)
                continue
print(result)
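Since the expected output is one list per heading, a small variation (my sketch, same assumptions about the file layout) keeps the values grouped by heading in a dict instead of a flat list:

import os

src_dir = os.getcwd()
keywords = ['work:', 'test:', 'ride:']
result = {key: [] for key in keywords}

for root, dirs, files in os.walk(src_dir):
    for filename in files:
        if not filename.endswith('.txt'):
            continue
        with open(os.path.join(root, filename), 'r') as fh:
            lines = [l.strip("'") for l in fh.read().splitlines()]
        for i, line in enumerate(lines):
            if line in keywords:
                # take the three lines that follow the heading
                result[line].extend(lines[i+1:i+4])

print(result)  # e.g. {'work:': ['500', 'ast', '800'], 'test:': ['tim', '200', '300'], ...}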

Using os.walk to create a filelist for each directory

I am attempting to use os.walk to create a list of files per subdirectory and to execute a function that merges all PDFs in each directory's list. The current script appends subsequent directories to the existing list with each loop, so the PDFs in directory1 are merged successfully, but the list for directory2 also includes the PDFs from directory1, and so on. I want it to refresh the list of files for each directory. Here is the script I am using currently:
import PyPDF2
import os
import sys

if len(sys.argv) > 1:
    SearchDirectory = sys.argv[1]
    print("I'm looking for PDF's in ", SearchDirectory)
else:
    print("Please tell me the directory to look in")
    sys.exit()

pdfWriter = PyPDF2.PdfFileWriter()

for root, dirs, files in os.walk(SearchDirectory):
    dirs.sort()
    for file in files:
        files.sort()
        pdfFiles = []
        if file.endswith('.pdf') and ((os.path.basename(root)) == "frames"):
            print("Discovered this pdf: ", os.path.join(root, file))
            pdfFiles.append(os.path.join(root, file))
    if pdfFiles:
        for file in pdfFiles:
            pdfFileObj = open(file, 'rb')
            pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
            for pageNum in range(0, pdfReader.numPages):
                pageObj = pdfReader.getPage(pageNum)
                pdfWriter.addPage(pageObj)
        pdfOutput = open((os.path.split(os.path.realpath(root))[0]) + ".pdf", "wb")
        pdfWriter.write(pdfOutput)
        pdfOutput.close()
        print("The following pdf has been successfully appended:", os.path.join(root, file))
    else:
        print("No pdfs found in this directory:", root)
The os.walk loop iterates once per directory, so you want to create a new PdfFileWriter for every directory.
It's also a good idea to use continue to bail out of the loop as soon as possible; this keeps the nesting flat.
Names that start with a capital letter are conventionally used for classes, so it should be searchDirectory, written with a small s.
Finally, take advantage of with blocks for handling I/O - they automatically call .close() for you.
I'm not going to install PyPDF2 just for this question, but this approach looks reasonable:
for root, dirs, files in os.walk(searchDirectory):
    if not os.path.basename(root) == "frames":
        continue
    pdfFiles = [os.path.join(root, file) for file in sorted(files) if file.endswith('.pdf')]
    if not pdfFiles:
        continue
    pdfWriter = PyPDF2.PdfFileWriter()
    outputFile = os.path.split(os.path.realpath(root))[0] + ".pdf"
    for file in pdfFiles:
        print("Discovered this pdf:", file)
        with open(file, 'rb') as pdfInput:
            pdfReader = PyPDF2.PdfFileReader(pdfInput)
            for page in pdfReader.pages:
                pdfWriter.addPage(page)
    with open(outputFile, "wb") as pdfOutput:
        pdfWriter.write(pdfOutput)
    print("%d files appended to %s" % (len(pdfFiles), outputFile))

Exporting Python to Excel is only showing one row of data [duplicate]

I have data which is accessed via an HTTP request and sent back by the server in a comma-separated format. I have the following code:
site= 'www.example.com'
hdr = {'User-Agent': 'Mozilla/5.0'}
req = urllib2.Request(site,headers=hdr)
page = urllib2.urlopen(req)
soup = BeautifulSoup(page)
soup = soup.get_text()
text=str(soup)
The content of text is as follows:
april,2,5,7
may,3,5,8
june,4,7,3
july,5,6,9
How can I save this data into a CSV file.
I know I can do something along the lines of the following to iterate line by line:
import StringIO
s = StringIO.StringIO(text)
for line in s:
But I'm unsure how to properly write each line to a CSV file.
EDIT: Thanks for the feedback; as suggested, the solution was rather simple and can be seen below.
Solution:
import StringIO
s = StringIO.StringIO(text)
with open('fileName.csv', 'w') as f:
    for line in s:
        f.write(line)
General way:
## text = list of strings to be written to the file
with open('csvfile.csv', 'wb') as file:
    for line in text:
        file.write(line)
        file.write('\n')
OR
Using CSV writer:
import csv
with open(<path to output_csv>, "wb") as csv_file:
    writer = csv.writer(csv_file, delimiter=',')
    for line in data:
        writer.writerow(line)
OR
Simplest way:
f = open('csvfile.csv','w')
f.write('hi there\n') #Give your csv text here.
## Python will convert \n to os.linesep
f.close()
You could just write to the file as you would write any normal file.
with open('csvfile.csv', 'wb') as file:
    for l in text:
        file.write(l)
        file.write('\n')
If, just in case, it is a list of lists, you could directly use the built-in csv module:
import csv
with open("csvfile.csv", "wb") as file:
    writer = csv.writer(file)
    writer.writerows(text)
I would simply write each line to a file, since it's already in a CSV format:
write_file = "output.csv"
with open(write_file, "wt", encoding="utf-8") as output:
    for line in text:
        output.write(line + '\n')
I can't recall how to write lines with line-breaks at the moment, though :p
Also, you might like to take a look at this answer about write(), writelines(), and '\n'.
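For reference, here is a tiny sketch of the difference that linked answer describes: neither write() nor writelines() adds newlines for you, so you append '\n' yourself either way (file names are just placeholders).

lines = ["april,2,5,7", "may,3,5,8"]

# write() takes a single string; add the newline explicitly
with open("a.csv", "w", encoding="utf-8") as f:
    for line in lines:
        f.write(line + "\n")

# writelines() takes an iterable of strings but still does not insert newlines
with open("b.csv", "w", encoding="utf-8") as f:
    f.writelines(line + "\n" for line in lines)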
To complement the previous answers, I whipped up a quick class to write to CSV files. It makes it easier to manage and close open files, and gives you consistency and cleaner code if you have to deal with multiple files.
import csv
import os

class CSVWriter():
    filename = None
    fp = None
    writer = None

    def __init__(self, filename):
        self.filename = filename
        self.fp = open(self.filename, 'w', encoding='utf8')
        self.writer = csv.writer(self.fp, delimiter=';', quotechar='"', quoting=csv.QUOTE_ALL, lineterminator='\n')

    def close(self):
        self.fp.close()

    def write(self, elems):
        self.writer.writerow(elems)

    def size(self):
        return os.path.getsize(self.filename)

    def fname(self):
        return self.filename
Example usage:
mycsv = CSVWriter('/tmp/test.csv')
mycsv.write((12,'green','apples'))
mycsv.write((7,'yellow','bananas'))
mycsv.close()
print("Written %d bytes to %s" % (mycsv.size(), mycsv.fname()))
Have fun
What about this:
with open("your_csv_file.csv", "w") as f:
f.write("\n".join(text))
str.join(): Return a string which is the concatenation of the strings in the iterable. The separator between elements is the string providing this method.
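For instance, with the sample data from the question:

text = ["april,2,5,7", "may,3,5,8", "june,4,7,3", "july,5,6,9"]
print("\n".join(text))
# april,2,5,7
# may,3,5,8
# june,4,7,3
# july,5,6,9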
In my situation...
with open('UPRN.csv', 'w', newline='') as out_file:
    writer = csv.writer(out_file)
    writer.writerow(('Name', 'UPRN', 'ADMIN_AREA', 'TOWN', 'STREET', 'NAME_NUMBER'))
    writer.writerows(lines)
You need to include the newline='' option in the call to open(); otherwise, on Windows an extra carriage return can be added to each row, which shows up as blank lines between rows.
https://www.programiz.com/python-programming/writing-csv-files
