I have PDFs distributed over several folders and subfolders.
I've been trying to write a short Python script to search each PDF for any term I enter.
As not all PDFs are searchable, I also tried to build lists of searchable and non-searchable PDFs, with the idea of bringing everything in line.
The program seems to work, up to a point. The longer it runs, the slower it gets.
At a certain moment, it just stops. I think it is a memory issue, but I can't seem to find a solution.
The script I have so far:
import os
# extracting_text.py
from PyPDF2 import PdfFileReader

search_word = input("enter a word you want to search in file: ")

counter = 0
noTextCounter = 0
SolutionCounter = 0

with open("Solutions.txt", "w") as text_file:
    text_file.writelines(f"List of files that contain: {search_word}")
    #print(f"List of files that contain: {search_word}", file=text_file)

def text_extractor(path):
    with open(path, 'rb') as f:
        # variables to find pdf's that only contain an image. If activated, countEmpty has to be included in the return.
        countEmpty = 0
        countSolution = 0
        pdf = PdfFileReader(f)
        # get the first page
        page = pdf.getPage(0)
        # print(page)
        # print('Page type: {}'.format(str(type(page))))
        text = page.extractText()
        if text == '':
            print('No text')
            countEmpty = countEmpty + 1
        else:
            if search_word in text:
                print("word found")
                countSolution = countSolution + 1
            else:
                print("word not found")
        # print(text)

        # selection of potential returns
        #return countEmpty
        return countSolution

root = os.getcwd()

try:
    for subdir, dirs, files in os.walk(root):
        for file in files:
            # print(os.path.join(subdir, file))
            filepath = subdir + os.sep + file

            if filepath.endswith(".pdf"):
                print(filepath)
                counter = counter + 1
                print(counter)

                if __name__ == '__main__':
                    path = filepath
                    indicator = text_extractor(path)
                    #noTextCounter = noTextCounter + indicator
                    SolutionCounter = SolutionCounter + indicator
                    print("indicator: " + str(indicator))
                    if indicator == 1:
                        with open("Solutions.txt", "a") as text_file:
                            text_file.writelines('\n' + path)
                    # below is the option to produce 2 lists: the pdf's which are images and the pdf's which are not
                    #with open("ListOfImagePdfs.txt", "a") as text_file:
                    #    text_file.writelines('\n' + path)
                    #else:
                    #    with open("ListOfDataPdfs.txt", "a") as text_file:
                    #        text_file.writelines('\n' + path)
                    #print("amount of image pdf's: " + str(noTextCounter))
except:
    pass
# try/except to be added
I am getting the error below when downloading files using multiprocessing. I am downloading Wikipedia page views, and they are split by hour, so it can involve a lot of downloads.
Any recommendation as to what causes this error and how to solve it? Thanks.
MaybeEncodingError: Error sending result:
''. Reason: 'TypeError("cannot serialize
'_io.BufferedReader' object",)'
import fnmatch
import requests
import urllib.request
from bs4 import BeautifulSoup
import multiprocessing as mp

def download_it(download_file):
    global path_to_save_document
    filename = download_file[download_file.rfind("/")+1:]
    save_file_w_submission_path = path_to_save_document + filename
    request = urllib.request.Request(download_file)
    response = urllib.request.urlopen(request)
    data_content = response.read()
    with open(save_file_w_submission_path, 'wb') as wf:
        wf.write(data_content)
    print(save_file_w_submission_path)

pattern = r'*200801*'
url_to_download = r'https://dumps.wikimedia.org/other/pagecounts-raw/'
path_to_save_document = r'D:\Users\Jonathan\Desktop\Wikipedia\\'

def main():
    global pattern
    global url_to_download

    r = requests.get(url_to_download)
    data = r.text
    soup = BeautifulSoup(data, features="lxml")

    list_of_href_year = []
    for i in range(2):
        if i == 0:
            for link in soup.find_all('a'):
                lien = link.get('href')
                if len(lien) == 4:
                    list_of_href_year.append(url_to_download + lien + '/')
        elif i == 1:
            list_of_href_months = []
            list_of_href_pageviews = []
            for loh in list_of_href_year:
                r = requests.get(loh)
                data = r.text
                soup = BeautifulSoup(data, features="lxml")
                for link in soup.find_all('a'):
                    lien = link.get('href')
                    if len(lien) == 7:
                        list_of_href_months.append(loh + lien + '/')
            if not list_of_href_months:
                continue
            for lohp in list_of_href_months:
                r = requests.get(lohp)
                data = r.text
                soup = BeautifulSoup(data, features="lxml")
                for link in soup.find_all('a'):
                    lien = link.get('href')
                    if "pagecounts" in lien:
                        list_of_href_pageviews.append(lohp + lien)

    matching_list_of_href = fnmatch.filter(list_of_href_pageviews, pattern)
    matching_list_of_href.sort()
    with mp.Pool(mp.cpu_count()) as p:
        print(p.map(download_it, matching_list_of_href))

if __name__ == '__main__':
    main()
As Darkonaut proposed, I used multithreading instead.
Example:
from multiprocessing.dummy import Pool as ThreadPool

'''This function is used to download the files using multi-threading'''
def multithread_download_files_func(self, download_file):
    try:
        filename = download_file[download_file.rfind("/")+1:]
        save_file_w_submission_path = self.ptsf + filename
        '''Check if the download doesn't already exist. If not, proceed; otherwise skip'''
        if not os.path.exists(save_file_w_submission_path):
            data_content = None
            try:
                '''Let's download the file'''
                request = urllib.request.Request(download_file)
                response = urllib.request.urlopen(request)
                data_content = response.read()
            except urllib.error.HTTPError:
                '''We will retry the download if the server is temporarily unavailable'''
                retries = 1
                success = False
                while not success:
                    try:
                        '''Make another request if the previous one failed'''
                        response = urllib.request.urlopen(download_file)
                        data_content = response.read()
                        success = True
                    except Exception:
                        '''We will make the program wait a bit before sending another request to download the file'''
                        wait = retries * 5
                        time.sleep(wait)
                        retries += 1
            except Exception as e:
                print(str(e))
            '''If the response data is not empty, we will write it as a new file stored in the data lake folder'''
            if data_content:
                with open(save_file_w_submission_path, 'wb') as wf:
                    wf.write(data_content)
                print(self.present_extract_RC_from_RS + filename)
    except Exception as e:
        print('funct multithread_download_files_func' + str(e))

'''This function is used as a wrapper before using multi-threading in order to download the files to be stored in the Data Lake'''
def download_files(self, filter_files, url_to_download, path_to_save_file):
    try:
        self.ptsf = path_to_save_file = path_to_save_file + 'Step 1 - Data Lake\Wikipedia Pagecounts\\'
        filter_files_df = filter_files
        self.filter_pattern = filter_files
        self.present_extract_RC_from_RS = 'WK Downloaded-> '

        if filter_files_df == '*':
            '''We will create a string of all the years concatenated together for later use in this program'''
            reddit_years = [2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]
            filter_files_df = ''
            '''Go through the years from 2005 to 2018'''
            for idx, ry in enumerate(reddit_years):
                filter_files_df += '*' + str(ry) + '*'
                if (idx != len(reddit_years)-1):
                    filter_files_df += '&'

        download_filter = list([x.strip() for x in filter_files_df.split('&')])
        download_filter.sort()

        '''If the folder doesn't exist, create one'''
        if not os.path.exists(os.path.dirname(self.ptsf)):
            os.makedirs(os.path.dirname(self.ptsf))

        '''We will get the website HTML elements using the beautifulsoup library'''
        r = requests.get(url_to_download)
        data = r.text
        soup = BeautifulSoup(data, features="lxml")

        list_of_href_year = []
        for i in range(2):
            if i == 0:
                '''Let's get all href available on this particular page. The first page is the year page'''
                for link0 in soup.find_all('a'):
                    lien0 = link0.get('href')
                    '''We will check if the length is 4, which corresponds to a year'''
                    if len(lien0) == 4:
                        list_of_href_year.append(url_to_download + lien0 + '/')
            elif i == 1:
                list_of_href_months = []
                list_of_href_pageviews = []
                for loh in list_of_href_year:
                    r1 = requests.get(loh)
                    data1 = r1.text
                    '''Get the webpage HTML tags'''
                    soup1 = BeautifulSoup(data1, features="lxml")
                    for link1 in soup1.find_all('a'):
                        lien1 = link1.get('href')
                        '''We will check if the length is 7, which corresponds to the year and month'''
                        if len(lien1) == 7:
                            list_of_href_months.append(loh + lien1 + '/')
                for lohm in list_of_href_months:
                    r2 = requests.get(lohm)
                    data2 = r2.text
                    '''Get the webpage HTML tags'''
                    soup2 = BeautifulSoup(data2, features="lxml")
                    for link2 in soup2.find_all('a'):
                        lien2 = link2.get('href')
                        '''We will now get all href that contain pagecounts in their name. The files are per hour, so 24 hours is 24 files
                        and per year is at least 24*365 = 8760 files'''
                        if "pagecounts" in lien2:
                            list_of_href_pageviews.append(lohm + lien2)

        existing_file_list = []
        for file in os.listdir(self.ptsf):
            filename = os.fsdecode(file)
            existing_file_list.append(filename)

        '''Filter the links'''
        matching_fnmatch_list = []
        if filter_files != '':
            for dfilter in download_filter:
                fnmatch_list = fnmatch.filter(list_of_href_pageviews, dfilter)
                i = 0
                for fnl in fnmatch_list:
                    '''Break for demo purposes only'''
                    if self.limit_record != 0:
                        if (i == self.limit_record) and (i != 0):
                            break
                    i += 1
                    matching_fnmatch_list.append(fnl)

        '''If the user stated a filter, we will try to remove the files which are outside that filter from the list'''
        to_remove = []
        for efl in existing_file_list:
            for mloh in matching_fnmatch_list:
                if efl in mloh:
                    to_remove.append(mloh)

        '''Let's remove the files which have been found outside the filter'''
        for tr in to_remove:
            matching_fnmatch_list.remove(tr)

        matching_fnmatch_list.sort()

        '''Multi-threading with 200 threads'''
        p = ThreadPool(200)
        p.map(self.multithread_download_files_func, matching_fnmatch_list)
    except Exception as e:
        print('funct download_files' + str(e))
From the accepted answer, I understood that it is simply a matter of replacing from multiprocessing import Pool with from multiprocessing.dummy import Pool.
This worked for me.
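For context, a minimal, self-contained sketch of that swap, with the download function stubbed out (the real download_it and the scraped matching_list_of_href come from the code above; the thread count of 4 is an arbitrary choice for this sketch):

from multiprocessing.dummy import Pool as ThreadPool  # same Pool API, but backed by threads

def download_it(download_file):
    # stub for the real download logic (urllib.request + writing the response to disk)
    return download_file

if __name__ == '__main__':
    # placeholder list; in the real script this is built by scraping the dumps index
    matching_list_of_href = ['https://dumps.wikimedia.org/other/pagecounts-raw/']
    # Threads share one process, so results are not pickled and sent between
    # processes, which is what triggered the '_io.BufferedReader' serialization error.
    with ThreadPool(4) as p:
        print(p.map(download_it, matching_list_of_href))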
I want to do some data analysis on a JSON file generated from Twitter.
The field that I need is the text field, which contains the tweet itself.
Your contribution is highly appreciated.
while tweetCount < maxTweets:
    try:
        if (max_id <= 0):
            if (not sinceId):
                new_tweets = api.search(q=search_query, count=tw_block_size)
            else:
                new_tweets = api.search(q=search_query, count=tw_block_size, since_id=sinceId)
        else:
            if (not sinceId):
                new_tweets = api.search(q=search_query, count=tw_block_size, max_id=str(max_id - 1))
            else:
                new_tweets = api.search(q=search_query, count=tw_block_size, max_id=str(max_id - 1), since_id=sinceId)
        if not new_tweets:
            print("Collecte terminee.")
            break
        for tweet in new_tweets:
            day = tweet.created_at.strftime('%Y-%m-%d')
            with open("%s/%s_tweets.json" % (output_dir, day), 'a') as f:
                f.write(json.dumps(tweet._json))
                f.write('\n')
        tweetCount += len(new_tweets)
        print("{0} tweets téléchargés".format(tweetCount))
        max_id = new_tweets[-1].id
    except tweepy.TweepError as e:
        print("an error was occured to continue , run the following command:")
        print("python collect.py -s {0} -o {1} -u {2}".format(search_query, output_dir, max_id))
        print("")
        print("Error : " + str(e))
        break
I am trying to use the following code to filter a txt file based on the info at indices 89, 90, 91 and 92.
The problem is that the output file is empty except for the header. The code is not giving any error, so I am not sure how else to go about debugging it.
Thanks for helping!!
for line in fileHandle:
    if firstLineFlag == 0:  # to skip the first line
        firstLineFlag = 1
        firstLineText = line  # save the first line elsewhere
        continue
    parts = line.strip().split('\t')
    try:
        column13 = float(parts[13-1])
    except ValueError:
        column13 = 0
    if column13 < 0.01:
        if parts[92] == "./.:.:.:.:.":
            Nor_info_2 = parts[92].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[91] == "./.:.:.:.:.":
            Nor_info = parts[91].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[90] == "./.:.:.:.:.":
            Tu_info_2 = parts[90].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        if parts[89] == "./.:.:.:.:.":
            Tu_info = parts[89].replace("./.:.:.:.:.", "00:1,1:1:1:1,1,1")
        normalSplit_2 = parts[92].split(':')
        normalSplit = parts[91].split(':')
        tumorSplit_2 = parts[90].split(':')
        tumorSplit = parts[89].split(':')
        print(Nor_info_2)
        try:
            TD_Tumor_2 = float(tumorSplit_2[3-1])
        except ValueError:
            TD_Tumor = 0
        try:
            TD_Tumor = float(tumorSplit[3-1])
        except ValueError:
            TD_Tumor = 0
        try:
            TD_Normal_2 = float(normalSplit_2[3-1])
        except ValueError:
            TD_Tumor = 0
        try:
            TD_Normal = float(normalSplit[3-1])
        except ValueError:
            TD_Tumor = 0
        if TD_Tumor_2 >= TD_Tumor and TD_Tumor_2 >= 7:
            tumorAD = tumorSplit_2[2-1].split(',')
            normalAD = normalSplit_2[2-1].split(',')
            normalratio = float(normalAD[2-1])/TD_Normal_2
        else:
            tumorAD = tumorSplit[2-1].split(',')
            normalAD = normalSplit[2-1].split(',')
            normalratio = float(normalAD[2-1])/TD_Normal
        tumorratio = float(tumorAD[2-1])/TD_Tumor_2
        parts.append(tumorratio)
        parts.append(normalratio)
        data.append(parts)

dataz1 = sorted(data, key=itemgetter(91), reverse=True)

#with open('filtered/'+currentFile+'_filtered.txt', 'w') as fileHandle:  ## to write your data in proper format
with open(currentFile+'_filtered.txt', 'w') as fileHandle:  ## to write your data in proper format
    fileHandle.write(firstLineText)
    for item in data:
        convert_first_to_generator = (str(w) for w in item)
        string = '\t'.join(convert_first_to_generator)
        string += '\n'
        #print string
        fileHandle.write(string)

command = 'mv ' + currentFile + '_filtered.txt filtered/'  ### to move edited files into a different folder
system(command)
I have been fighting with a threaded send of an image string over Python sockets for a while now and have had no luck with this issue.
The code for the client side is:
import socket
from PIL import ImageGrab #windows only screenshot
from threading import Thread
import win32api, win32con
import re
import win32com.client
import getpass
import time
import select
shell = win32com.client.Dispatch("WScript.Shell")
host = raw_input("SERVER:")
dm = win32api.EnumDisplaySettings(None, 0)
dm.PelsHeight = 800
dm.PelsWidth = 600
win32api.ChangeDisplaySettings(dm, 0)
port = 9000
def picture():
    while 1:
        image = ImageGrab.grab().resize((800, 600))  # send screen as string
        data = image.tostring()
        sendme = (data)
        try:
            s.sendall(sendme)
            print ("sent")
        except socket.error as e:
            print e
        except Exception as e:
            print e

s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
s.connect((host, port))

pict = Thread(target=picture)
pict.start()

while 1:
    socket_list = [s]
    # Get the list of sockets which are readable
    read_sockets, write_sockets, error_sockets = select.select(socket_list, [], [])
    for sock in read_sockets:
        if sock == s:
            data = sock.recv(1024)
            print data
            if "LEFTC" in data:
                data = data.replace("LEFTC", "")
                x = re.findall(r'X(.*?)Y', data)
                y = re.findall(r'Y(.*?)EOC', data)
                x = str(x)
                y = str(y)
                # REPLACE CODE TO BE REWRITTEN
                x = x.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
                y = y.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
                print(str(x) + ' X\n')
                print(str(y) + ' Y\n')
                try:
                    win32api.SetCursorPos((int(x), int(y)))  # click time
                    win32api.mouse_event(win32con.MOUSEEVENTF_LEFTDOWN, int(x), int(y), 0, 0)
                    win32api.mouse_event(win32con.MOUSEEVENTF_LEFTUP, int(x), int(y), 0, 0)
                except Exception as e:
                    print e
            elif "RIGHTC" in data:
                data = data.replace("RIGHTC", "")
                x = re.findall(r'X(.*?)Y', data)
                y = re.findall(r'Y(.*?)EOC', data)
                x = str(x)
                y = str(y)
                # REPLACE FUNCTION MARKED FOR REWRITE
                x = x.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
                y = y.replace("[", "").replace("]", "").replace("'", "").replace(" ", "")
                print(str(x) + ' X\n')
                print(str(y) + ' Y\n')
                try:  # click
                    win32api.SetCursorPos((int(x), int(y)))
                    win32api.mouse_event(win32con.MOUSEEVENTF_RIGHTDOWN, int(x), int(y), 0, 0)
                    win32api.mouse_event(win32con.MOUSEEVENTF_RIGHTUP, int(x), int(y), 0, 0)
                except Exception as e:
                    print e
            else:
                # This does not work correctly: only BACKSPACE and the else branch are working.
                if "CAPS" in data:
                    shell.SendKeys('{CAPSLOCK}')
                elif "CAPSOFF" in data:
                    shell.SendKeys('{CAPSLOCK}')
                elif "BACKSPACE" in data:
                    shell.SendKeys('{BACKSPACE}')
                elif "SHIFT" in data:
                    shell.SendKeys('+' + data)
                else:
                    shell.SendKeys(data)
            time.sleep(0.1)
The code for the server side is:
import socket
import pygame
from pygame.locals import *
from threading import Thread
x = y = 0
host = ""
#port defined here
port = 9000
#This list is used to make the library more pythonic and compact. This also leads to less source code.
keylist = [pygame.K_a,pygame.K_b,pygame.K_c,pygame.K_d,pygame.K_e,pygame.K_f,pygame.K_g,pygame.K_h,pygame.K_i,pygame.K_j,pygame.K_k,pygame.K_l,pygame.K_m,pygame.K_n,pygame.K_o,pygame.K_p,pygame.K_q,pygame.K_r,pygame.K_s,pygame.K_t,pygame.K_u,pygame.K_v,pygame.K_w,pygame.K_x,pygame.K_y,pygame.K_z]
key = ['a','b','c','d','e','f','g','h','i','j','k','l','m','n','o','p','q','r','s','t','u','v','w','x','y','z','1','2','3','4','5','6','7','8','9','0']
# i/o function
def ioinput(sock):
    while 1:
        evt = pygame.event.poll()  # has to be in the same while loop as the evt called or it won't work.
        if evt.type == pygame.MOUSEBUTTONDOWN and evt.button == 1:  # one for left
            x, y = evt.pos
            command = ("LEFTC" + " " + "X" + str(x) + "Y" + str(y) + "EOC")
            sock.sendall(command)
        elif evt.type == pygame.MOUSEBUTTONDOWN and evt.button == 3:  # 3 for right; 2 is middle, which support comes for later.
            x, y = evt.pos
            command = ("RIGHTC" + " " + "X" + str(x) + "Y" + str(y) + "EOC")
            sock.sendall(command)
        elif evt.type == pygame.KEYDOWN:
            keyname = pygame.key.name(evt.key)
            if evt.key == pygame.K_BACKSPACE:
                command = ("BACKSPACE")
                sock.sendall(command)
            elif evt.key in keylist:
                if keyname in key:
                    command = (keyname)
                    sock.sendall(command)

def mainloop():
    message = []
    while 1:
        try:
            while True:
                try:
                    conn, addr = server.accept()
                except socket.error:
                    break
            screen = pygame.display.set_mode((800, 600))
            clickctrl = Thread(target=ioinput, args=(conn,))
            clickctrl.start()
            while 1:
                d = conn.recv(1024*1024*1)
                if not d:
                    break
                else:
                    message.append(d)
                data = ''.join(message)
                image = pygame.image.frombuffer(data, (800, 600), "RGB")
                screen.blit(image, (0, 0))
                pygame.display.flip()
        except Exception as e:
            continue
server = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
server.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
server.setblocking(False)
server.bind((host, port))
server.listen(55000)
print "Listening on %s" % ("%s:%s" % server.getsockname())
# Main event loop.
mainloop()
The picture thread will run three to six times and then die, but the keyboard and mouse input layer continues to operate. I suspect that the GIL is getting in my way. Am I correct, or am I missing something really simple here? This program is supposed to be a simplistic reverse remote desktop application.
I found the problem after speaking with a good friend. It turns out that my server-side while loop was set up so that it would break.
I fixed this by changing:
while 1:
    d = conn.recv(1024*1024*1)
    if not d:
        break
    else:
        message.append(d)
    data = ''.join(message)
    image = pygame.image.frombuffer(data,(800,600),"RGB")
    screen.blit(image,(0,0))
    pygame.display.flip()
to:
while 1:
    d = conn.recv(1024*1024*1)
    message.append(d)
    try:
        print("attempting to parse..")
        data = ''.join(message)
        image = pygame.image.frombuffer(data,(800,600),"RGB")
        screen.blit(image,(0,0))
        pygame.display.flip()
        print("recieved pic")
    except Exception as e:
        print e
        continue
Also, on the client side I added a time.sleep(1) in the picture thread after the exception handling, otherwise the image does not come through correctly.
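For reference, a minimal sketch of what the picture thread looks like with that sleep added (s is the connected client socket from the code above, and the one-second pause is simply the value mentioned here, not a tuned number):

def picture():
    while 1:
        image = ImageGrab.grab().resize((800, 600))  # grab and resize the screen, as before
        data = image.tostring()                      # raw RGB bytes of the frame
        try:
            s.sendall(data)                          # send the whole frame to the server
            print("sent")
        except socket.error as e:
            print(e)
        except Exception as e:
            print(e)
        time.sleep(1)  # give the server time to join and render the frame before the next one arrives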