How to manage memory consumption in this code using Pool and Itertools - python-3.x

I have a task to make an iteration from "aaa, aab, aac...aa1...999" using multiprocessing. It works, but if I make the strings more than 5 symbols long it crashes. Please help me make it lighter!
The iteration eventually gets written into a txt file.
import logging
import multiprocessing
import pathlib
import uuid
from itertools import product
from logging import Logger
from multiprocessing import Pool
from string import ascii_lowercase, digits
from typing import Iterable

STORAGE = pathlib.Path(__file__).parent.joinpath("words_storage")


def init_logger() -> Logger:
    logger = logging.getLogger()
    logging.basicConfig(level=logging.INFO)
    return logger


class Combinator:
    def __init__(
        self,
        alphabet: Iterable,
        str_len: int,
        purge_after: bool = True,
        logger: Logger = init_logger(),
    ):
        self.alphabet = alphabet
        self.str_len = abs(str_len)
        self.purge_after = purge_after
        self.logger = logger

    def product(self):
        self.logger.info("[START] Combining started [START]")
        self.pre_clean()
        try:
            with Pool(processes=multiprocessing.cpu_count() - 1) as pool:
                result = pool.map(self._get_for_char, self.alphabet)
            self.logger.info("[PROCESS] List of filenames received [PROCESS]")
            for name in result:
                self.logger.info(
                    f"[PROCESS] Extracting combinations from {name} [PROCESS]"
                )
                with open(
                    STORAGE.joinpath("result/result.txt"), mode="a"
                ) as file_result:
                    with open(name) as producer:
                        file_result.write(producer.read())
        finally:
            if self.purge_after:
                self.logger.info(
                    "[CLEANING] Purging created .txt files in storage [CLEANING]"
                )
                Combinator.purge_files()
        self.logger.info(
            "[END] All combinations created successfully. See result.txt file [END]"
        )

    def combinations_generator(self, char):
        for starter_combination in product(self.alphabet, repeat=self.str_len - 1):
            combination = [char]
            combination.extend(starter_combination)
            yield combination

    def _get_for_char(self, char) -> pathlib.Path:
        self.logger.info(f"[PROCESS] Processing char: {char} [PROCESS]")
        file_name = STORAGE.joinpath(f"{uuid.uuid4()}.txt")
        result = self.combinations_generator(char)
        with open(file_name, mode="w") as file:
            file.write(
                Combinator.writer("".join(combination) for combination in result)
            )
        return file_name

    def pre_clean(self):
        try:
            STORAGE.joinpath("result/result.txt").unlink()
            self.logger.info("[PRE-CLEAN] Result directory cleared [PRE-CLEAN]")
        except FileNotFoundError:
            self.logger.info("[PRE-CLEAN] Result directory already clear [PRE-CLEAN]")

    @staticmethod
    def writer(result_list: Iterable[str]) -> str:
        return "\n".join(result_list)

    @staticmethod
    def purge_files():
        for file in STORAGE.glob("*.txt"):
            STORAGE.joinpath(file).unlink()


if __name__ == "__main__":
    repeats = 5
    alphabet_for_combinations = ascii_lowercase + digits
    combinator = Combinator(alphabet_for_combinations, repeats, purge_after=True)
    combinator.product()
I tried regulating the number of processes and the maxtasksperchild number, but it didn't help.
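A minimal sketch of one way to make it lighter (my suggestion, untested against your setup): stream each combination straight to disk in _get_for_char instead of joining them all into one giant string first, so per-process memory stays roughly flat however large str_len gets. Likewise, shutil.copyfileobj(producer, file_result) would concatenate the per-char files without reading each one fully into memory.

# Hedged sketch: drop-in replacement for _get_for_char that writes each
# combination as it is generated instead of building one huge joined string.
def _get_for_char(self, char) -> pathlib.Path:
    self.logger.info(f"[PROCESS] Processing char: {char} [PROCESS]")
    file_name = STORAGE.joinpath(f"{uuid.uuid4()}.txt")
    with open(file_name, mode="w") as file:
        for combination in self.combinations_generator(char):
            file.write("".join(combination) + "\n")  # nothing accumulates in memory
    return file_name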

Related

How to publish to Pub/Sub from Dataflow in batch (efficiently)?

I want to publish messages with some attributes to a Pub/Sub topic from a Dataflow job in batch mode.
My Dataflow pipeline is written with Python 3.8 and apache-beam 2.27.0.
It works with the @Ankur solution here: https://stackoverflow.com/a/55824287/9455637
But I think it could be more efficient with a shared Pub/Sub client: https://stackoverflow.com/a/55833997/9455637
However, an error occurred:
return StockUnpickler.find_class(self, module, name)
AttributeError: Can't get attribute 'PublishFn' on <module 'dataflow_worker.start' from '/usr/local/lib/python3.8/site-packages/dataflow_worker/start.py'>
Questions:
Would the shared publisher implementation improve beam pipeline performance?
Is there another way to avoid the pickling error on my shared publisher client?
My Dataflow Pipeline :
import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.options.pipeline_options import PipelineOptions
from apache_beam.options.pipeline_options import SetupOptions
from google.cloud.pubsub_v1 import PublisherClient
import json
import argparse
import re
import logging


class PubsubClient(PublisherClient):
    def __reduce__(self):
        return self.__class__, (self.batch_settings,)


# The DoFn to perform on each element in the input PCollection.
class PublishFn(beam.DoFn):
    def __init__(self):
        from google.cloud import pubsub_v1

        batch_settings = pubsub_v1.types.BatchSettings(
            max_bytes=1024,  # One kilobyte
            max_latency=1,  # One second
        )
        self.publisher = PubsubClient(batch_settings)
        super().__init__()

    def process(self, element, **kwargs):
        future = self.publisher.publish(
            topic=element["topic"],
            data=json.dumps(element["data"]).encode("utf-8"),
            **element["attributes"],
        )
        return future.result()


def run(argv=None, save_main_session=True):
    """Main entry point; defines and runs the pipeline."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--source_table_id",
        dest="source_table_id",
        default="",
        help="BigQuery source table <project>.<dataset>.<table> with columns (topic, attributes, data)",
    )
    known_args, pipeline_args = parser.parse_known_args(argv)

    # We use the save_main_session option because one or more DoFn's in this
    # workflow rely on global context (e.g., a module imported at module level).
    pipeline_options = PipelineOptions(pipeline_args)
    # pipeline_options.view_as(SetupOptions).save_main_session = save_main_session

    bq_source_table = known_args.source_table_id
    bq_table_regex = r"^(?P<PROJECT_ID>[a-zA-Z0-9_-]*)[\.|\:](?P<DATASET_ID>[a-zA-Z0-9_]*)\.(?P<TABLE_ID>[a-zA-Z0-9_-]*)$"

    regex_match = re.search(bq_table_regex, bq_source_table)

    if not regex_match:
        raise ValueError(
            f"Bad BigQuery table id : `{bq_source_table}` please match {bq_table_regex}"
        )

    table_ref = bigquery.TableReference(
        projectId=regex_match.group("PROJECT_ID"),
        datasetId=regex_match.group("DATASET_ID"),
        tableId=regex_match.group("TABLE_ID"),
    )

    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable" #
            >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)  # Each row contains: topic / attributes / data
            | "PublishRowsToPubSub" >> beam.ParDo(PublishFn())
        )


if __name__ == "__main__":
    logging.getLogger().setLevel(logging.INFO)
    run()
After fussing with this a bit, I think I have an answer that works consistently and is, if not world-beatingly performant, at least tolerably usable:
import logging

import apache_beam as beam
from apache_beam.io.gcp.pubsub import PubsubMessage
from google.cloud.pubsub_v1 import PublisherClient
from google.cloud.pubsub_v1.types import (
    BatchSettings,
    LimitExceededBehavior,
    PublishFlowControl,
    PublisherOptions,
)


class PublishClient(PublisherClient):
    """
    You have to override __reduce__ to make PublisherClient pickleable 😡 😤 🤬
    Props to 'Ankur' and 'Benjamin' on SO for figuring this part out; god knows
    I would not have...
    """

    def __reduce__(self):
        return self.__class__, (self.batch_settings, self.publisher_options)


class PubsubWriter(beam.DoFn):
    """
    beam.io.gcp.pubsub does not yet support batch operations, so
    we do this the hard way. it's not as performant as the native
    pubsubio but it does the job.
    """

    def __init__(self, topic: str):
        self.topic = topic
        self.window = beam.window.GlobalWindow()
        self.count = 0

    def setup(self):
        batch_settings = BatchSettings(
            max_bytes=1e6,  # 1MB
            # by default it is 10 ms, should be less than timeout used in future.result() to avoid timeout
            max_latency=1,
        )

        publisher_options = PublisherOptions(
            enable_message_ordering=False,
            # better to be slow than to drop messages during a recovery...
            flow_control=PublishFlowControl(limit_exceeded_behavior=LimitExceededBehavior.BLOCK),
        )

        self.publisher = PublishClient(batch_settings, publisher_options)

    def start_bundle(self):
        self.futures = []

    def process(self, element: PubsubMessage, window=beam.DoFn.WindowParam):
        self.window = window
        self.futures.append(
            self.publisher.publish(
                topic=self.topic,
                data=element.data,
                **element.attributes,
            )
        )

    def finish_bundle(self):
        """Iterate over the list of async publish results and block
        until all of them have either succeeded or timed out. Yield
        a WindowedValue of the success/fail counts."""
        results = []
        self.count = self.count + len(self.futures)
        for fut in self.futures:
            try:
                # future.result() blocks until success or timeout;
                # we've set a max_latency of 60s upstairs in BatchSettings,
                # so we should never spend much time waiting here.
                results.append(fut.result(timeout=60))
            except Exception as ex:
                results.append(ex)

        res_count = {"success": 0}
        for res in results:
            if isinstance(res, str):
                res_count["success"] += 1
            else:
                # if it's not a string, it's an exception
                msg = str(res)
                if msg not in res_count:
                    res_count[msg] = 1
                else:
                    res_count[msg] += 1

        logging.info(f"Pubsub publish results: {res_count}")

        yield beam.utils.windowed_value.WindowedValue(
            value=res_count,
            timestamp=0,
            windows=[self.window],
        )

    def teardown(self):
        logging.info(f"Published {self.count} messages")
The trick is that if you call future.result() inside the process() method, you will block until that single message is successfully published, so instead collect a list of futures and then at the end of the bundle make sure they're all either published or definitively timed out. Some quick testing with one of our internal pipelines suggested that this approach can publish 1.6M messages in ~200s.
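For completeness, a hedged sketch of how this DoFn might be wired into the batch pipeline from the question. Everything here is my assumption rather than part of the answer above: the function name, the row field names "data" and "attributes" (mirroring the question's BigQuery columns), and the fact that a single topic is passed to the DoFn constructor instead of being read per row as in the original PublishFn. PubsubWriter is the class defined above; pipeline_options and table_ref would come from the question's run().

import json

import apache_beam as beam
from apache_beam.io.gcp import bigquery
from apache_beam.io.gcp.pubsub import PubsubMessage


def publish_table_to_pubsub(pipeline_options, table_ref, topic):
    """Sketch: read BigQuery rows, wrap them as PubsubMessages, publish in batches."""
    with beam.Pipeline(options=pipeline_options) as p:
        (
            p
            | "ReadFromBqTable" >> bigquery.ReadFromBigQuery(table=table_ref, use_json_exports=True)
            # Turn each BigQuery row into the PubsubMessage the DoFn expects
            | "ToPubsubMessage" >> beam.Map(
                lambda row: PubsubMessage(
                    data=json.dumps(row["data"]).encode("utf-8"),
                    attributes=row["attributes"],
                )
            )
            | "PublishRowsToPubSub" >> beam.ParDo(PubsubWriter(topic=topic))
        )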

Duplicate a string in Tkinter when redirecting output to a text field

I redirect all my output to the program's text field in Tkinter, and I wanted to add a date and time to each message:
class StdRedirector(object):
    def __init__(self, text_field):
        self.text_field = text_field

    def write(self, string):
        msg_time = datetime.now().strftime('%m-%d %H:%M:%S')
        self.text_field.configure(state='normal')
        self.text_field.insert('end', f'{msg_time} >> {string}')
        self.text_field.see('end')
        self.text_field.configure(state='disabled')


class App:
    def __init__(self):
        self.builder = pygubu.Builder()
        self.__init_ui()
        self.__init_callbacks()
        self.mainwindow.mainloop()

    def __init_ui(self):
        self.builder.add_from_file(path.join(base_dir, 'assets', 'app.ui'))
        self.mainwindow = self.builder.get_object('mainwindow')
        self.output_text_field = self.builder.get_object('output_text_field')
        sys.stdout = StdRedirector(self.output_text_field)
        sys.stderr = StdRedirector(self.output_text_field)

    def __init_callbacks(self):
        callbacks = {
            'update_balance_function': self.__update_balance
        }
        self.builder.connect_callbacks(callbacks)

    def __update_balance(self):
        print(True)
But the date prefix I added is duplicated:
As I understand it, the string is split on the line separator \n and each substring is sent to write() separately, including the line break itself. Can I fix it somehow?
You can simply check whether the string argument in write() contains any meaningful content, e.g. using if string.strip():
class StdRedirector(object):
    def __init__(self, text_field):
        self.text_field = text_field

    def write(self, string):
        self.text_field.configure(state='normal')
        if string.strip():  # add date before message
            msg_time = datetime.now().strftime('%m-%d %H:%M:%S')
            self.text_field.insert('end', f'{msg_time} >> {string}')
        else:  # simply insert string
            self.text_field.insert('end', string)
        self.text_field.see('end')
        self.text_field.configure(state='disabled')
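For context on why the prefix was duplicated in the first place: print() normally calls write() twice, once with the message and once with the trailing "\n", so the timestamp gets prepended to both chunks. A tiny probe (illustrative only, not part of the original app) makes this visible:

import sys

class Probe:
    def write(self, string):
        # show every chunk print() hands us, via the real stdout
        sys.__stdout__.write(repr(string) + "\n")
    def flush(self):
        pass

sys.stdout = Probe()
print("hello")
# output:
# 'hello'
# '\n'
# i.e. two separate write() calls, hence two timestamp prefixes without the strip() check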

How to find out how long a search for files will take on python?

So I have a little app that searches for all XML files on my PC, copying the files whose filenames are 44 digits long to the "output" folder.
The problem is that the final user needs an indication of the progress and remaining time of the task.
This is the module to copy files:
xml_search.py
import os
import re
from threading import Thread
from datetime import datetime
import time
import shutil
import winsound

os.system('cls')


def get_drives():
    response = os.popen("wmic logicaldisk get caption")
    list1 = []
    t1 = datetime.now()
    for line in response.readlines():
        line = line.strip("\n")
        line = line.strip("\r")
        line = line.strip(" ")
        if (line == "Caption" or line == ""):
            continue
        list1.append(line + '\\')
    return list1


def search1(drive):
    for root, dir, files in os.walk(drive):
        for file in files:
            if re.match("\d{44}.xml", file):
                filename = os.path.join(root, file)
                try:
                    shutil.copy(filename, os.path.join('output', file))
                except Exception as e:
                    pass


def exec_(callback):
    t1 = datetime.now()
    list2 = []  # empty list is created
    list1 = get_drives()
    for each in list1:
        process1 = Thread(target=search1, args=(each,))
        process1.start()
        list2.append(process1)
    for t in list2:
        t.join()  # Terminate the threads
    t2 = datetime.now()
    total = str(t2 - t1)
    print(total, file=open('times.txt', 'a'), end="\n")
    for x in range(3):
        winsound.Beep(2000, 100)
        time.sleep(.1)
    callback()


if __name__ == "__main__":
    exec_()
The code below uses the progressbar library and shows an indication of the progress and the remaining time of the task:
import progressbar
from time import sleep

bar = progressbar.ProgressBar(maxval=1120, \
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()])
bar.start()
for i in range(1120):
    bar.update(i + 1)
    sleep(0.1)
bar.finish()
You would need to adapt the above code to yours: count the number of files first, pass that count as the ProgressBar constructor's maxval argument, and remove the sleep call.
This progress-bar approach works most naturally with a single thread; if you insist on using multiple threads, you will need to figure out where to initialise the progress bar and where to put the updates. A single-threaded sketch of the idea follows below.
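A hedged sketch of that single-threaded approach, per drive. The helper names and the hard-coded "output" folder are my assumptions, mirroring the original search1: a first pass counts the matching files so maxval is known, a second pass copies them and advances the bar once per file.

import os
import re
import shutil

import progressbar  # pip install progressbar (or progressbar2)

XML_44_DIGITS = re.compile(r"\d{44}\.xml$")


def count_matches(drive):
    # First pass: count matching files so the bar knows its maxval.
    total = 0
    for root, dirs, files in os.walk(drive):
        total += sum(1 for f in files if XML_44_DIGITS.match(f))
    return total


def copy_with_progress(drive):
    total = count_matches(drive)
    bar = progressbar.ProgressBar(
        maxval=max(total, 1),
        widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.ETA()],
    )
    bar.start()
    done = 0
    # Second pass: copy each match and update the bar.
    for root, dirs, files in os.walk(drive):
        for f in files:
            if XML_44_DIGITS.match(f):
                try:
                    shutil.copy(os.path.join(root, f), os.path.join('output', f))
                except OSError:
                    pass
                done += 1
                bar.update(done)
    bar.finish()

# usage: copy_with_progress('C:\\')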
Try to implement a timer decorator like the following:
import time


def mytimer(func):
    def wrapper():
        t1 = time.time()
        result = func()
        t2 = time.time()
        print(f"The function {func.__name__} was run {t2 - t1} seconds")
        return result
    return wrapper


@mytimer
def TimeConsumingFunction():
    time.sleep(3)
    print("Hello timers")


TimeConsumingFunction()
Output:
/usr/bin/python3.7 /home/user/Documents/python-workspace/timers/example.py
Hello timers
The function TimeConsumingFunction was run 3.002610206604004 seconds
Process finished with exit code 0

How to write to text file in python?

I am a beginner in Python. I have created a plain text file and have to encrypt it to an output file, but I am getting an error and am unable to write to the output file. The code runs, but the output file, which should contain the encrypted data, is not created.
#!/usr/bin/env python3
import os
import binascii
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import argparse


def readfile_binary(file):
    with open(file, 'rb') as f:
        content = f.read()
    return content


def writefile_binary(file, content):
    with open(file, 'wb') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description = 'Encryption and Decryption of the file')
    parser.add_argument('-in', dest = 'input', required = True)
    parser.add_argument('-out', dest = 'output', required = True)
    parser.add_argument('-K', dest = 'key', help = 'The key to be used for encryption must be in hex')
    parser.add_argument('-iv', dest = 'iv', help = 'The inintialisation vector, must be in hex')
    args = parser.parse_args()
    input_content = readfile_binary(args. input)
    output_content = writefile_binary(args. output)


if __name__ == "__main__":
    main()
The output file should be encrypted and it should be available in the directory.
These two lines:
input_content = readfile_binary(args. input)
output_content = writefile_binary(args. output)
There should not be a space in args.input. Here is an example:
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('filename')
args = parser.parse_args()


# using type hints can help reasoning about code
def write(filename: str, content: str) -> None:
    with open(filename, 'wb') as f:
        f.write(str.encode(content))


# if the filename was successfully parsed from the command line
if args.filename == 'filename.txt':
    print(f"args: {args.filename}")
    # write to the appropriate output file
    write(filename=args.filename, content="content")
You might need to correct your code's indentation. Python requires indenting code within each function definition, loop, etc.
And as eric points out, there should be no spaces after the periods in args. input and args. output. Change those to args.input and args.output instead.
So:
#!/usr/bin/env python3
import os
import binascii
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.backends import default_backend
import argparse


def readfile_binary(file):
    with open(file, 'rb') as f:
        content = f.read()
    return content


def writefile_binary(file, content):
    with open(file, 'wb') as f:
        f.write(content)


def main():
    parser = argparse.ArgumentParser(description = 'Encryption and Decryption of the file')
    parser.add_argument('-in', dest = 'input', required = True)
    parser.add_argument('-out', dest = 'output', required = True)
    parser.add_argument('-K', dest = 'key', help = 'The key to be used for encryption must be in hex')
    parser.add_argument('-iv', dest = 'iv', help = 'The inintialisation vector, must be in hex')
    args = parser.parse_args()
    input_content = readfile_binary(args.input)
    output_content = writefile_binary(args.output)


if __name__ == "__main__":
    main()
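Beyond the syntax fixes, note that main() still never encrypts anything: writefile_binary() is even called without its content argument. As a hedged sketch (my illustration, not part of the original answer), the cryptography primitives already imported could be used for AES-CBC like this, assuming -K and -iv are supplied as hex strings of the right length:

import binascii

from cryptography.hazmat.backends import default_backend
from cryptography.hazmat.primitives import padding
from cryptography.hazmat.primitives.ciphers import Cipher, algorithms, modes


def encrypt_bytes(plaintext: bytes, key_hex: str, iv_hex: str) -> bytes:
    # Decode the hex-encoded key and IV passed on the command line
    key = binascii.unhexlify(key_hex)
    iv = binascii.unhexlify(iv_hex)
    # PKCS7-pad the plaintext to the AES block size (128 bits)
    padder = padding.PKCS7(algorithms.AES.block_size).padder()
    padded = padder.update(plaintext) + padder.finalize()
    # Encrypt with AES in CBC mode
    cipher = Cipher(algorithms.AES(key), modes.CBC(iv), backend=default_backend())
    encryptor = cipher.encryptor()
    return encryptor.update(padded) + encryptor.finalize()

# Inside main(), something along these lines:
#     input_content = readfile_binary(args.input)
#     encrypted = encrypt_bytes(input_content, args.key, args.iv)
#     writefile_binary(args.output, encrypted)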

Why are my class functions executed when importing the class?

It's probably a very basic question, but I was unable to find an answer that I could thoroughly understand.
In my main program main_program.py, I'm importing a class that itself imports another class:
in main_program.py:
from createTest import *
in createTest.py:
print("TEST")
from recordRecallQused import *
print("TEST")
now in recordRecallQused:
class recordRecallQused:
    def __init__(self, path):
        self.path = path
        try:
            with open(self.path, 'r', newline='') as question_used:
                question_used.closed
        except IOError:
            # if file doesn't exist
            print("the file doesn't exist")
            with open(self.path, 'w', newline='') as question_used:
                question_used.closed

    def recallQused(self):
        list_Qused = []
        print("I'm being executed")
        with open(self.path, 'r', newline='') as question_used:
            questionused = csv.reader(question_used)
            for item in questionused:
                if len(item) > 0:
                    list_Qused.append(item[0])
            question_used.closed
        return list_Qused
What I obtain in the kernel:
>TEST
>I'm being executed
>TEST
So functions inside the class are executed even though they are not called, but I have read that this is "normal": def is not a mere declaration but a "live" statement.
Still, I have tried something much simpler:
in main_program_TEST.py
from class1 import *
a = class1()
in class1.py:
print("t")
from class2 import *
print("t")
class class1:
def __init__(self):
pass
def message(self):
print("prout")
in class2.py:
class class2:
    def __init__(self):
        pass

    def message(self):
        print("prout2")
When executing main_program_TEST.py the kernel displays
>t
>t
so this time the functions in class2.py have not been executed; otherwise the kernel would instead show:
>t
>prout2
>t
I really wonder why.
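Worth spelling out the import semantics here: importing a module executes all of its top-level statements once, but it does not execute function or method bodies; those run only when called. A tiny illustration (file names are made up):

# demo_module.py
print("runs on import")             # top-level statement: executes at import time

def f():
    print("runs only when called")  # function body: executes only when f() is called

# main.py
# import demo_module    -> prints "runs on import" and nothing else
# demo_module.f()       -> now prints "runs only when called"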
Stephen Rauch you are right, part of my code in recordRecallQused.py was calling the function.
"""#load all list
print("loading questions info")
# questions info: answers, category
list_AllQ = []
with open('questionsInfoTo130.csv', newline = '') as csvfile:
questionsInfo = csv.reader(csvfile)
# loop over the questions information rows
for (i,row) in enumerate(questionsInfo):
if(i!=0):
list_AllQ.append(row)
csvfile.close()
path = 'question_used.csv'"""
list_AllQ = [[0,1,2,1,"que"],[0,1,2,2,"que"],[0,1,2,3,"que"],[0,1,2,4,"que"],[0,1,2,55,"que"],[0,1,2,6,"que"],[0,1,2,7,"que"],[0,1,2,8,"que"],[0,1,2,9,"que"]]
a = recordRecallQused('question_used.csv')
list_Qused = a.recallQused()
list_avQ = a.createListavQ(list_Qused, list_AllQ)
list_Qtest = a.createListQtest(list_avQ)
a.recordQused(list_Qtest)
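A common fix (a sketch only, reusing the original names and trimming the test data) is to put that module-level test code behind an if __name__ == "__main__": guard, so it runs only when recordRecallQused.py is executed directly and not when it is imported:

# recordRecallQused.py
# ... class recordRecallQused as above ...

if __name__ == "__main__":
    # test code: runs for "python recordRecallQused.py",
    # but not for "from recordRecallQused import *"
    list_AllQ = [[0, 1, 2, 1, "que"], [0, 1, 2, 2, "que"]]  # trimmed test data
    a = recordRecallQused('question_used.csv')
    list_Qused = a.recallQused()
    print(list_Qused)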
