Scraping info out of PDFs using Python - python-3.x

I have PDFs distributed over several folders and subfolders.
I've been trying to write a short Python script to search each PDF for any term I enter.
Since not all PDFs are searchable, I also tried to build lists of searchable and non-searchable PDFs, with the idea of bringing everything in line.
The program seems to work, up to a point: the longer it runs, the slower it goes, and at a certain moment it just stops. I think it is a memory issue, but I can't seem to find a solution.
The script I have so far:
import os
# extracting_text.py
from PyPDF2 import PdfFileReader

search_word = input("enter a word you want to search in file: ")
counter = 0
noTextCounter = 0
SolutionCounter = 0

with open("Solutions.txt", "w") as text_file:
    text_file.writelines(f"List of files that contain: {search_word}")
    # print(f"List of files that contain: {search_word}", file=text_file)

def text_extractor(path):
    with open(path, 'rb') as f:
        # Counter to find PDFs that only contain an image. If activated,
        # countEmpty has to be included in the return.
        countEmpty = 0
        countSolution = 0
        pdf = PdfFileReader(f)
        # get the first page
        page = pdf.getPage(0)
        # print(page)
        # print('Page type: {}'.format(str(type(page))))
        text = page.extractText()
        if text == '':
            print('No text')
            countEmpty = countEmpty + 1
        else:
            if search_word in text:
                print("word found")
                countSolution = countSolution + 1
            else:
                print("word not found")
            # print(text)
        # Selection of potential returns
        # return countEmpty
        return countSolution

root = os.getcwd()
try:
    for subdir, dirs, files in os.walk(root):
        for file in files:
            # print(os.path.join(subdir, file))
            filepath = subdir + os.sep + file
            if filepath.endswith(".pdf"):
                print(filepath)
                counter = counter + 1
                print(counter)
                if __name__ == '__main__':
                    path = filepath
                    indicator = text_extractor(path)
                    # noTextCounter = noTextCounter + indicator
                    SolutionCounter = SolutionCounter + indicator
                    print("indicator: " + str(indicator))
                    if indicator == 1:
                        with open("Solutions.txt", "a") as text_file:
                            text_file.writelines('\n' + path)
                        # Below is an option to write two lists: the PDFs that
                        # are only images and the PDFs that contain text.
                        # with open("ListOfImagePdfs.txt", "a") as text_file:
                        #     text_file.writelines('\n' + path)
                        # else:
                        #     with open("ListOfDataPdfs.txt", "a") as text_file:
                        #         text_file.writelines('\n' + path)
    # print("amount of image pdf's: " + str(noTextCounter))
except:
    pass
    # try/except to be refined
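A likely culprit is the bare except: pass wrapped around the entire walk: the first unhandled PyPDF2 error (a corrupt or encrypted file, for instance) silently ends the run, which can look like the script just stopping. Below is a minimal sketch of a leaner loop, assuming the same PyPDF2 PdfFileReader API and first-page-only search as above; treat it as a starting point rather than a drop-in fix.
import os
from PyPDF2 import PdfFileReader

def first_page_text(path):
    # Read only the first page, matching the original script's behaviour.
    with open(path, 'rb') as f:
        return PdfFileReader(f).getPage(0).extractText()

def search_pdfs(root, search_word):
    with open("Solutions.txt", "w") as out:  # opened once, not per hit
        out.write(f"List of files that contain: {search_word}\n")
        for subdir, dirs, files in os.walk(root):
            for name in files:
                if not name.lower().endswith(".pdf"):
                    continue
                path = os.path.join(subdir, name)
                try:
                    text = first_page_text(path)
                except Exception as e:
                    # Log and move on instead of aborting the whole walk.
                    print(f"skipping {path}: {e}")
                    continue
                if text == '':
                    print('No text:', path)  # likely an image-only PDF
                elif search_word in text:
                    out.write(path + "\n")

if __name__ == '__main__':
    search_pdfs(os.getcwd(), input("enter a word you want to search in file: "))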

Related

Incomplete downloading of instagram followers

I downloaded instaloader from here.
I found the code here, and I slightly modified it as below:
import os.path
import instaloader

directory = 'c:\\Users\\_Instagram'
os.chdir(directory)

i = 0
filename = "username_%s.txt"
while os.path.exists(filename % i):
    i += 1
file_path = os.path.join(directory, filename)

L = instaloader.Instaloader()

# Login or load session
username = "username"
password = "password"
L.login(username, password)  # (login)

# Obtain profile metadata
profile = instaloader.Profile.from_username(L.context, username)

# Print list of followees
follow_list = []
count = 0
for followee in profile.get_followers():
    follow_list.append(followee.username)
    file = open(filename % i, "a+")
    file.write(follow_list[count])
    file.write("\n")
    file.close()
    count = count + 1
    print(count)
It only writes and counts fewer than 300 followers, when the actual follower count is 4000+.
Can someone help me figure out why? I'm using Python 3.7.
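One thing worth ruling out before blaming Instaloader is the per-follower open/append/close of the output file. A hedged sketch that opens the file once; this alone may not explain the missing followers (rate limiting on large follower lists is another common suspect), but it simplifies the failure surface:
with open(filename % i, "w") as f:
    for count, followee in enumerate(profile.get_followers(), start=1):
        f.write(followee.username + "\n")  # one username per line, as before
        print(count)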

while not looping in python

So I have a script with a loop that isn't working, and I have no idea why.
I need the menu part to loop until I input 0. How could I do this with a while loop? I'm still learning how to use while loops.
#!/usr/bin/env python3
from ftplib import FTP

host = "localhost"
user = "chris"
password = "qwerty"
ftp = FTP(host, user, password)

# current working directory of my ftp
ftp.cwd("/home/chris")

# list of files
files = ftp.nlst()

# list length to enter as key values
list_length = len(files)

# conversion of list
def Convert(files):
    it = iter(files)
    res_dct = dict(zip(range(1, list_length), it))
    return res_dct

dico_files = Convert(files)

# list of files loop
for key in dico_files:
    file_list = print(str(key) + ": " + dico_files[key])

# menu
while selection != 0:
    selection = str(input("what file to choose?"))
    localfile = open(selection, 'wb')
    ftp.retrbinary('RETR ' + selection, localfile.write, 1024)
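Two things keep this menu from looping: selection is read by the while test before it is ever assigned (a NameError), and input() returns a string, which can never equal the integer 0. A minimal sketch that fixes both, keeping the question's variable names and the retrieve-by-typed-name behaviour as-is:
selection = None
while selection != "0":
    selection = input("what file to choose? (0 to quit) ")
    if selection == "0":
        break
    with open(selection, 'wb') as localfile:  # file name typed by the user
        ftp.retrbinary('RETR ' + selection, localfile.write, 1024)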

Expanding the output to the top 5 email addresses (from the top 1) in Python script

My last homework assignment is writing a script that finds the 5 most common email addresses in a text file (linked on Hastebin below). I've found a way to find the single most common email address, but how can I expand this output to the top 5? Any help would be greatly appreciated.
while True:
    try:
        filename = input("Enter a file name: ")
        fhand = open(filename, 'r')
        email_addresses = {}
        for line in fhand:
            if line.startswith("From "):
                email = line.split()[1]
                email_addresses[email] = email_addresses.get(email, 0) + 1
        max_address = None
        max_emails = 0
        for k in email_addresses:
            if email_addresses[k] > max_emails:
                max_address = k
                max_emails = email_addresses[k]
        print(max_address, max_emails)
        print(email_addresses, email)
        ans = input('Do you want to try another file?: (y/n): ')
        ans = ans.lower()
        if ans == 'y':
            continue
        if ans == 'n':
            print('Thanks for playing!')
            break
        else:
            continue
    except:
        print('File name', filename, 'does not exist.')
        continue
And the text file: https://hastebin.com/egixurubak.makefile
A quick idea:
Find the single most common email address in email_addresses, remove it from the dict, store it in a list, then find the next most common address, and so on.
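A sketch of that idea, run against the email_addresses dict the script already builds (five rounds of find-max-and-remove, on a copy so the original counts survive):
counts = dict(email_addresses)  # work on a copy
top5 = []
for _ in range(min(5, len(counts))):
    address = max(counts, key=counts.get)  # current most common address
    top5.append((address, counts.pop(address)))
for address, n in top5:
    print(address, n)
For reference, collections.Counter(email_addresses).most_common(5) gives the same list in one call.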

Is it possible to clear a temp file in AWS Lambda?

I have created a Lambda that scans an uploaded file for specific phrases listed in another S3 bucket. If a phrase from the list is matched in the uploaded file, the Lambda writes out the matching line of the transcript as well as the response that follows it.
This Lambda works if we upload each transcript individually; however, if we upload more than one, it keeps the previous output and prepends it to the new one.
I suspect the issue is that the files under /tmp/ are not cleared when the Lambda function ends.
Is there a way to clear the /tmp/ files each time a job is done?
The output looks as follows:
ch_0 : Okay. And then, um, how do you guys typically allocate funding for a project like this?
-------------------------------------------------------------
ch_1 : Yeah, we do have capital projects and we've allocated money 3 place, which is and stuff, Um, every year.
ch_0 : Okay. And then, um, how do you guys typically allocate funding for a project like this?
-------------------------------------------------------------
ch_1 : Yeah, we do have capital projects and we've allocated money 3 place, which is and stuff, Um, every year.
ch_0 : Okay. And then, um, how do you guys typically allocate funding for a project like this?
-------------------------------------------------------------
ch_1 : Yeah, we do have capital projects and we've allocated money 3 place, which is and stuff, Um, every year.
However, it should look like this:
ch_0 : Okay. And then, um, how do you guys typically allocate funding for a project like this?
-------------------------------------------------------------
ch_1 : Yeah, we do have capital projects and we've allocated money 3 place, which is and stuff, Um, every year.
My lambda code is as follows:
import boto3

def lambda_handler(event, context):
    s3 = boto3.client("s3")
    if event:
        file_obj = event["Records"][0]
        bucketname = str(file_obj['s3']['bucket']['name'])
        filename = str(file_obj['s3']['object']['key'])
        job_name = filename
        print("Filename: ", filename)
        fileObj = s3.get_object(Bucket=bucketname, Key=filename)
        file_content = fileObj["Body"].read().decode('utf-8')
        budget_file = s3.get_object(Bucket="bantp-phrases", Key="B.txt")
        budget_content = budget_file["Body"].read().decode('utf-8')
        authority_file = s3.get_object(Bucket="bantp-phrases", Key="A.txt")
        authority_content = authority_file["Body"].read().decode('utf-8')
        need_file = s3.get_object(Bucket="bantp-phrases", Key="N.txt")
        need_content = need_file["Body"].read().decode('utf-8')
        timeline_file = s3.get_object(Bucket="bantp-phrases", Key="T.txt")
        timeline_content = timeline_file["Body"].read().decode('utf-8')
        partner_file = s3.get_object(Bucket="bantp-phrases", Key="P.txt")
        partner_content = partner_file["Body"].read().decode('utf-8')

        # Convert everything to lists
        budgets = budget_content.split("\n")
        authorities = authority_content.split("\n")
        needs = need_content.split("\n")
        timelines = timeline_content.split("\n")
        partners = partner_content.split("\n")
        lines = file_content.split("\n")
        directory_name = filename
        mylist = lines

        # Budget phrase analysis
        for b in budgets:
            with open("/tmp/budget.txt", "a") as x:
                try:
                    output = None
                    for index, line in enumerate(lines):
                        if b.strip() in line:
                            output = index
                            break
                    if output:
                        x.write("\n" + lines[output] + "\n")
                        x.write("-------------------------------------------------------------")
                        x.write("\n" + lines[output + 1] + "\n")
                        print("It worked!")
                        break
                except (ValueError):
                    x.write("Nothing found")
                    print("It didn't work :(")
                    break
        s3.upload_file(Filename="/tmp/budget.txt", Bucket="bantp-analysis",
                       Key=directory_name + '/' + "Budget_" + filename)

        # Authority phrase analysis
        for a in authorities:
            with open("/tmp/authority.txt", "a") as c:
                try:
                    output = None
                    for index, line in enumerate(lines):
                        if a.strip() in line:
                            output = index
                    if output:
                        c.write("\n" + lines[output] + "\n")
                        c.write("-------------------------------------------------------------")
                        c.write("\n" + lines[output + 1] + "\n")
                        print("It worked!")
                except (ValueError):
                    c.write("Nothing found")
                    print("It didn't work :(")
        s3.upload_file(Filename="/tmp/authority.txt", Bucket="bantp-analysis",
                       Key=directory_name + '/' + "Authority_" + filename)

        # Need phrase analysis
        for n in needs:
            with open("/tmp/need.txt", "a") as v:
                try:
                    output = None
                    for index, line in enumerate(lines):
                        if n.strip() in line:
                            output = index
                            break
                    if output:
                        v.write("\n" + lines[output] + "\n")
                        v.write("-------------------------------------------------------------")
                        v.write("\n" + lines[output + 1] + "\n")
                        print("It worked!")
                        break
                except (ValueError):
                    v.write("Nothing found")
                    print("It didn't work :(")
                    break
        s3.upload_file(Filename="/tmp/need.txt", Bucket="bantp-analysis",
                       Key=directory_name + '/' + "Need_" + filename)

        # Timeline phrase analysis
        for t in timelines:
            with open("/tmp/timeline.txt", "a") as z:
                try:
                    output = None
                    for index, line in enumerate(lines):
                        if t.strip() in line:
                            output = index
                            break
                    if output:
                        z.write("\n" + lines[output] + "\n")
                        z.write("-------------------------------------------------------------")
                        z.write("\n" + lines[output + 1] + "\n")
                        print("It worked!")
                        break
                except (ValueError):
                    z.write("Nothing found")
                    print("It didn't work :(")
                    break
        s3.upload_file(Filename="/tmp/timeline.txt", Bucket="bantp-analysis",
                       Key=directory_name + '/' + "Timeline_" + filename)

        # Partner phrase analysis
        for p in partners:
            with open("/tmp/partner.txt", "a") as q:
                try:
                    output = None
                    for index, line in enumerate(lines):
                        if p.strip() in line:
                            output = index
                            break
                    if output:
                        q.write("\n" + lines[output] + "\n")
                        q.write("-------------------------------------------------------------")
                        q.write("\n" + lines[output + 1] + "\n")
                        print("It worked!")
                except (ValueError):
                    q.write("Nothing found")
                    print("It didn't work :(")
        s3.upload_file(Filename="/tmp/partner.txt", Bucket="bantp-analysis",
                       Key=directory_name + '/' + "Partner_" + filename)
Welcome to Stack Overflow!
Can you try the following solution and comment with the results, please?
In all your open operations, change the file mode from a (append) to w (write).
Example:
with open("/tmp/timeline.txt", "a") as z:
becomes
with open("/tmp/timeline.txt", "w") as z:
Make this change for every open call, so the existing file in /tmp is overwritten instead of appended to. Also take care of the indentation.
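If you'd rather clear the files explicitly, as the question asks, one hedged option is to delete any leftover outputs at the top of the handler. /tmp persists between invocations of a warm Lambda container, which is exactly why the old output reappears:
import glob
import os

def clear_tmp_outputs():
    # /tmp survives across invocations while the container stays warm,
    # so remove stale result files before each run.
    for path in glob.glob("/tmp/*.txt"):
        os.remove(path)
Calling clear_tmp_outputs() at the start of lambda_handler would then give each job a clean slate.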

Do not process already present data

I have a folder containing some video files. I want to extract frames from the videos, but only those videos whose names are not already present in the CSV should be processed. The script should check the CSV for the names of already-processed videos before processing anything.
import csv
import os
from glob import glob

import cv2

def extractFrames(m):
    global vid_name
    vid_files = glob(m)
    print(vid_files)
    complete_videos = get_completed_videos()
    print(complete_videos)
    new_vid_files = [x for x in vid_files if get_vid_name(x) not in complete_videos]
    for vid in new_vid_files:
        print("path of video========>>>>.", vid)
        v1 = os.path.basename(vid)
        try:
            vid_name = get_vid_name(vid)
            vidcap = cv2.VideoCapture(vid)
        except cv2.error as e:
            print(e)
        except:
            print('error')
        # condition
        fsize = os.stat(vid)
        print('=============size of video ===================:', fsize.st_size)
        try:
            if fsize.st_size > 1000:
                fps = vidcap.get(cv2.CAP_PROP_FPS)  # OpenCV version 2 used "CV_CAP_PROP_FPS"
                frameCount = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
                duration = frameCount / fps
                minutes = int(duration / 60)
                print('fps = ' + str(fps))
                print('number of frames = ' + str(frameCount))
                print('duration (S) = ' + str(duration))
                if duration > 1:
                    success, image = vidcap.read()
                    count = 0
                    success = True
                    while success:
                        img_name = vid_name + '_f' + str(count) + ".jpg"
                        success, image = vidcap.read()
                        if count % 10 == 0 or count == 0:
                            target_non_target(img_name, image)
                        count += 1
                    vidcap.release()
                    cv2.destroyAllWindows()
        except:
            print("error")
        print('finished processing video ', vid)
        with open("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\" + 'video_info.csv', 'a') as csv_file:
            fieldnames = ['Video_Name', 'Process']
            file_is_empty = os.stat("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\" + 'video_info.csv').st_size == 0
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            if file_is_empty:
                writer.writeheader()
            writer.writerow({'Video_Name': vid_name, 'Process': 'done'})

def get_vid_name(vid):
    return os.path.splitext(os.path.basename(vid))[0]

def get_completed_videos():
    completed_videos = []
    with open("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\video_info.csv") as csv_file:
        for row in csv.reader(csv_file):
            for col in range(0, len(row)):
                try:
                    completed_videos.append(row[col])
                except Exception as e:
                    print(str(e))
    print(completed_videos[0])
    return completed_videos
Suppose there are 3 videos in a folder. The code runs successfully for those 3 videos and their names are written to the CSV. Now if I drop video number 4 into the folder, it should process only the 4th video after checking the video names already present in the CSV. Currently it reprocesses all the video files every time the script is run.
First off, inside the for loop,
v1=os.path.basename(vid_files[v_f])
should be
v1=os.path.basename(new_vid_files[v_f])
since you are looping over the new_vid_files range. Using those indices on the original list will give you unexpected items. Better yet, you can use a for-each loop directly (since you don't seem to be using v_f for anything other than list access) as follows:
for vid in new_vid_files:
And this vid would replace all instances of new_vid_files[v_f].
Next, you are using vid_name to write to the CSV, so you need to perform the same operation on each item from vid_files before matching against complete_videos when creating the new_vid_files list.
If you create a method for getting the video name as follows:
def get_vid_name(vid_file):
return os.path.splitext(os.path.basename(vid_file))[0]
Then you can change the list comprehension to be
new_vid_files = [x for x in vid_files if get_vid_name(x) not in complete_videos]
Edit: As mentioned in the comments on the other answer, the output of complete_videos shows the CSV isn't being parsed properly: the list ends up containing the column headers and the other, unneeded columns as well. The code below works despite that, but it should still be fixed. I am not solving it here because it is a relatively simple change, and I want the OP to understand what they are doing wrong.
def extractFrames(m):
    global vid_name
    vid_files = glob(m)
    print(vid_files)
    complete_videos = get_completed_videos()
    new_vid_files = [x for x in vid_files if get_vid_name(x) not in complete_videos]
    for vid in new_vid_files:
        print("path of video========>>>>.", vid)
        v1 = os.path.basename(vid)
        try:
            vid_name = get_vid_name(vid)
            vidcap = cv2.VideoCapture(vid)
        except cv2.error as e:
            print(e)
        except:
            print('error')
        # condition
        fsize = os.stat(vid)
        print('=============size of video ===================:', fsize.st_size)
        try:
            if fsize.st_size > 1000:
                fps = vidcap.get(cv2.CAP_PROP_FPS)  # OpenCV version 2 used "CV_CAP_PROP_FPS"
                frameCount = int(vidcap.get(cv2.CAP_PROP_FRAME_COUNT))
                duration = frameCount / fps
                minutes = int(duration / 60)
                print('fps = ' + str(fps))
                print('number of frames = ' + str(frameCount))
                print('duration (S) = ' + str(duration))
                if duration > 1:
                    success, image = vidcap.read()
                    count = 0
                    success = True
                    while success:
                        img_name = vid_name + '_f' + str(count) + ".jpg"
                        success, image = vidcap.read()
                        if count % 10 == 0 or count == 0:
                            target_non_target(img_name, image)
                        count += 1
                    vidcap.release()
                    cv2.destroyAllWindows()
        except:
            print("error")
        print('finished processing video ', vid)
        with open("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\" + 'video_info.csv', 'a') as csv_file:
            fieldnames = ['Video_Name', 'Process']
            file_is_empty = os.stat("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\" + 'video_info.csv').st_size == 0
            writer = csv.DictWriter(csv_file, fieldnames=fieldnames)
            if file_is_empty:
                writer.writeheader()
            writer.writerow({'Video_Name': vid, 'Process': 'done'})

def get_vid_name(vid):
    return os.path.splitext(os.path.basename(vid))[0]

def get_completed_videos():
    completed_videos = []
    with open("C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\video_info.csv") as csv_file:
        for row in csv.reader(csv_file):
            for col in range(0, len(row)):
                try:
                    completed_videos.append(row[col])
                except Exception as e:
                    print(str(e))
    print(completed_videos[0])
    return completed_videos
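On the Edit above: for reference, a minimal sketch of a stricter get_completed_videos, assuming the two-column layout (Video_Name, Process) that extractFrames writes, so the header row and the Process column stay out of the lookup list:
import csv

def get_completed_videos():
    # DictReader consumes the header row and lets us keep only the
    # Video_Name column, instead of appending every cell of every row.
    path = "C:\\multi_cat_3\\models\\research\\object_detection\\my_imgs\\video_info.csv"
    with open(path) as f:
        return [row['Video_Name'] for row in csv.DictReader(f)]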
