Why can't I correctly extract the image from this PDF? [Please help] - python-3.x

I am currently working on OCR on PDF files. Here is my pipeline:
I first extract the images from the PDF (since my PDFs contain scanned documents) and convert them to NumPy arrays,
then I read them with Tesseract.
It works pretty well on most of my images, but there are several from which I can't extract the image at all. The file below is an example: I can't find the scanned image containing the written part (the one I need for OCR). It's driving me crazy (where has it gone?).
Perhaps you could help me retrieve that image and understand why my approach does not let me recover this "ghost" image?
NB: I noticed that the problematic images inside the PDF are in "jpx" (JPEG 2000) format.
Edit: Since the image cannot be found in the PDF, I tried a horrible trick (while waiting for a cleverer explanation :) ): rendering the whole PDF page to a pixmap (PyMuPDF allows that) and then writing the pixmap to disk in different formats (PNG, TIFF). The quality is far too degraded compared with the original PDF, so a reasonable read with Tesseract is out of the question.
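For reference, here is the direction I would try next for those "jpx" streams (a sketch only, I have not verified it on this file): pull the raw embedded image with doc.extractImage() instead of building a Pixmap, and let Pillow decode the JPEG 2000 data. This assumes the PyMuPDF version used below provides extractImage() and that Pillow was built with OpenJPEG support.
import io
import fitz  # PyMuPDF
from PIL import Image

doc = fitz.open("IB00058815877D0000000.pdf")
for img in doc.getPageImageList(0):
    xref = img[0]
    raw = doc.extractImage(xref)             # raw embedded stream, no re-rendering
    print(xref, raw["ext"])                  # expected to report "jpx" for the problem image
    # Pillow needs OpenJPEG support to decode JPEG 2000 ("jpx") streams
    pil_img = Image.open(io.BytesIO(raw["image"]))
    pil_img.save("xref{}.png".format(xref))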
Here is the PDF example file (if you know a simpler way to host it, I am curious): https://www.filehosting.org/file/details/906817/IB00058815877D0000000.pdf
Here are the 2 images I extract from the file (the second one should contain text instead of garbage).
Here is my code to extract images:
import fitz
import os
import logging
import cv2
from PIL import Image
from .utils import lazyproperty,showpdf
from .imhelpers import show
from ..config import myconfig
from impocr import logger
import pytesseract
pytesseract.pytesseract.tesseract_cmd = myconfig.TESSERACT_CMD
class InvalidImage(Exception):
pass
class PDFParser():
"""
"""
def __init__(self,filepath,page_num=0):
self.filepath = filepath
self.filename = os.path.basename(self.filepath).split('.pdf')[0]
try:
self._doc = fitz.open(filepath)
self.page_num = page_num
self._page = self._doc[page_num]
except Exception as e:
print("Lecture PDF impossible. {}".format(e))
raise
@lazyproperty
def text(self):
return self._page.getText()
@lazyproperty
def _pixs(self):
imgs = self._doc.getPageImageList(self.page_num)
pixs =[]
for img in imgs:
xref = img[0]
pix = fitz.Pixmap(self._doc, xref)
pixs.append(pix)
return pixs
@lazyproperty
def _pixpage(self):
pix = self._page.getPixmap(colorspace=fitz.csGRAY)
return pix
@property
def img(self):
return self.imgs[0]
@property
def pageimg(self):
pix = self._pixpage
return self.pix2np(pix)
@lazyproperty
def imgs(self):
pixs = self._pixs
imgsarray = []
for pix in pixs:
img = self.pix2np(pix)
imgsarray.append(img)
return imgsarray
def find_first_valid_image(self):
img_valid = None
for i,img in enumerate(self.imgs):
try:
import ipdb;ipdb.set_trace()
res = pytesseract.image_to_osd(img)
img_valid = img
return img_valid
except pytesseract.TesseractError:
continue
if img_valid==None:
logger.warning('No readable image in page {} of the document {}'.format(self.page_num, self.filename))
raise InvalidImage('No readable image in page {} of the document {}'.format(self.page_num, self.filename))
def write(self,outputdir,fullpage=False):
try:
os.makedirs(outputdir)
logger.info("Directory {} is created".format(outputdir))
except FileExistsError:
pass
def _writepix(pix,filepath):
# This is GRAY or RGB
try:
pix.writePNG(filepath)
# CMYK: convert to RGB first
except:
pix = fitz.Pixmap(fitz.csRGB, pix)
pix.writePNG(filepath)
pix = None
if fullpage:
filepath = os.path.join(outputdir,'{}_p{}.png'.format(self.filename,self.page_num))
pix = self._pixpage
_writepix(pix,filepath)
return
pixs = self._pixs
for i,pix in enumerate(pixs):
filepath = os.path.join(outputdir,'{}_p{}_i{}.png'.format(self.filename,self.page_num,i))
_writepix(pix,filepath)
return
def pix2np(self,pix):
"""
Convert pixmap to image np.ndarray
https://stackoverflow.com/questions/53059007/python-opencv
param pix: pixmap
"""
import numpy as np
#https://stackoverflow.com/questions/22236749/numpy-what-is-the-difference-between-frombuffer-and-fromstring
im = np.frombuffer(pix.samples, dtype=np.uint8).reshape(pix.h, pix.w, pix.n)
try:
im = np.ascontiguousarray(im[..., [2, 1, 0]]) # rgb to bgr
except IndexError:
#Trick to convert Gray to BGR, (im.reshape)
#logger.warning("Need to convert Gray to BGR [filepath: {}]".format(self.filepath))
im = cv2.cvtColor(im,cv2.COLOR_GRAY2RGB)
im = np.ascontiguousarray(im[..., [2, 1, 0]])
return im
if __name__ == "__main__":
filepath = r'data\inputs\test\impot_textpdf_with_one_logoimage.pdf'
###### Parse page 0 (first page) ######
pdf = PDFParser(filepath,0)
text = pdf.text
imgs = pdf.imgs
show(pdf.imgs[0])
show(pdf.imgs[1])
############### other functions ####################
class lazyproperty:
def __init__(self, func):
self.func = func
def __get__(self, instance, cls):
if instance is None:
return self
else:
value = self.func(instance)
setattr(instance, self.func.__name__, value)
return value
def show(image):
import matplotlib.pyplot as plt
fig,ax = plt.subplots(1)
ax.imshow(image)
plt.show()

My solution is not so good (I am still waiting for better ideas), but here are my 2 cents: I first write the full page to disk and read it back with OpenCV (I changed the method find_first_valid_image, as you can see below).
from tempfile import TemporaryDirectory
def find_first_valid_image(self):
#import ipdb;ipdb.set_trace()
img_valid = None
for i,img in enumerate(self.imgs):
try:
#import ipdb;ipdb.set_trace()
res = pytesseract.image_to_osd(img)
img_valid = img
return img_valid
except pytesseract.TesseractError:
continue
if img_valid==None:
logger.warning('No readable image in page {} of the document {}. Tried the fullpage.'.format(self.page_num, self.filename))
with TemporaryDirectory() as tmpdirname:
# write() returns None, so rebuild the path exactly as write() names the full-page file
self.write(tmpdirname, fullpage=True)
filepath = os.path.join(tmpdirname, '{}_p{}.png'.format(self.filename, self.page_num))
img_fullpage = cv2.imread(filepath)
return img_fullpage
I think it degrades the quality of my original image, so when applying Tesseract to the image I get a bad OCR result, as you can see below.
"""DIRECTION GÉNÉRALE DE6 FNANCES PUBLIQUES\n\nAVIS D'IMPÔT 2017\nIMPÔT SUR LES REVENUS\nd Fannée 2016\n\n \n\nPour vos _ démarches,\npas besoin doiginal —\nMc d furir un —\nphotocopie, vérifiable sur —\nTmpots gouv vn\n\nVotre situation\n\n \n\nVos rétérences.\n\nPour accéder à votre espace partculior MONTANT À PAYER\nNuméro fiscal | | A us ario 15/00/2017 (41)\n\nN* daccès en ligne voirvouo déciaration | | Détail du montant à payer\nRevenu fiscal d référence Montart de vtr impôt su e revors\n# | Rétéronce de 'avis <VRRRRS | Versemens sur 1er acompte\nVersomontssur 26 acompto\n\nNuméro F —\n\nNuméro de rôle 016 A\nDate c'étaissement 2m0762017|\nDate de mise en recouvrement 3vo7æ2017|\n\n \n\n \n\n \n\n3899,00 €\n3893006\n\n \n\n \n\nLa somme que vous davez payer est supérieure à 2 000 €\nLa loirend obligatoie le paiement de cette somme par un des moyens suivants, à votre choix :\n\nur impots.gouv.fr: payez en igne ou adhérez au prélèvement à léchéance en vous connectant à vore\nspaco pariclor, pislissoz-vous guider\n\npartéléphone, courrir où couriel pour adhérer au prélèvement à échéanco (aux coordonnéesindiquées\ndansle cadre - Vos démarches »\n\nPour 2018,vous pourrez achérerau prélèvement mensue\n\x0c"""

Related

cv2 wait key (Hoping to make it variable depending on latency)

When using my code below (it turns YouTube videos into ASCII with audio), the latency between the audio and the video grows bigger with each frame (I have tried many different wait times). I was wondering if there is a way to change the code so that the wait key varies depending on how much latency there is. I have only been coding for 6 months, so sorry if there is any bad code.
import pytube
import os
import cv2
import PIL.Image
import winsound
from moviepy.editor import *
from pydub import AudioSegment
import threading
import time
import sys  # needed for sys.stdout.write in generate_frame
##################################################
# downloads the youtube video and lets you input the path for where it should be saved
url = input ("Enter the youtube url: \n\n")
path = input ("Enter the path where you want the youtube video to be saved: \n\n")
try:
youtube = pytube.YouTube(url)
streams = youtube.streams.all()
video = youtube.streams.get_highest_resolution()
video.download(path)
print ("Done!")
except:
print ("\nYoutube video has coppy righted material so it can not be downloaded. Try again with a different video")
##################################################
#locates all the files with the file extension .mp4
for root, dirs, files in os.walk(path):
for file in files:
if file.endswith(".mp4"):
file_name = (file)
file_path = (os.path.join(root,file))
print (file_name)
print (file_path)
##################################################
mp3_file = (path+"\\")
mp3_file = (mp3_file+"audio.mp3")
mp4_file = (path+"\\")
mp4_file = (mp4_file+file_name)
VideoClip = VideoFileClip(mp4_file)
audioclip = VideoClip.audio
audioclip.write_audiofile(mp3_file)
audioclip.close()
VideoClip.close()
sound = AudioSegment.from_mp3(mp3_file)
sound.export(path+"/audio.wav", format = "wav")
##################################################
def a():
# Ascii characters used to create the output
ASCII_CHARS = ["#", "#", "S", "%", "?", "*", "+", ";", ":", ",", "."]
def resized_gray_image(image ,new_width=80):
width,height = image.size
aspect_ratio = height/width
new_height = int(aspect_ratio * new_width)
resized_gray_image = image.resize((new_width,new_height)).convert('L')
return resized_gray_image
def pix2chars(image):
pixels = image.getdata()
characters = "".join([ASCII_CHARS[pixel//25] for pixel in pixels])
return characters
def generate_frame(image,new_width=80):
new_image_data = pix2chars(resized_gray_image(image))
total_pixels = len(new_image_data)
ascii_image = "\n".join([new_image_data[index:(index+new_width)] for index in range(0, total_pixels, new_width)])
sys.stdout.write(ascii_image)
os.system('cls' if os.name == 'nt' else 'clear')
cap = cv2.VideoCapture(mp4_file)
print (cap)
try:
while True:
ret,frame = cap.read()
cv2.imshow("frame",frame)
generate_frame(PIL.Image.fromarray(frame))
cv2.waitKey(1)
except:
threading.Thread(target=c).start()
##################################################
def b():
winsound.PlaySound(path+"/audio.wav",winsound.SND_FILENAME)
##################################################
def c ():
os.remove (mp3_file)
os.remove (mp4_file)
os.remove (path+"/audio.wav")
threading.Thread(target=a).start()
threading.Thread(target=b).start()
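One way to make the delay adaptive rather than fixed (a sketch, not tested against the script above; the video path is hypothetical): pace each frame against the video's nominal FPS using wall-clock time, and skip the wait entirely when the loop is already behind.
import time
import cv2

cap = cv2.VideoCapture("video.mp4")            # hypothetical path
fps = cap.get(cv2.CAP_PROP_FPS) or 30.0        # fall back if the container reports no FPS
frame_interval = 1.0 / fps
start = time.time()
frame_idx = 0

while True:
    ret, frame = cap.read()
    if not ret:
        break
    cv2.imshow("frame", frame)
    cv2.waitKey(1)                             # keep a tiny fixed waitKey for the GUI window
    # ... generate and print the ASCII frame here ...
    frame_idx += 1
    target = start + frame_idx * frame_interval    # wall-clock time this frame is due
    delay = target - time.time()
    if delay > 0:
        time.sleep(delay)                      # do the real pacing with sleep, not waitKey
    # if delay <= 0 the loop is behind schedule, so it does not wait at all

cap.release()
cv2.destroyAllWindows()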

raspberry pi3 b+ slideshow program not working

I wanted to turn my Raspberry Pi into a slideshow device, but I barely know Python 3. Could anyone help me fix this code I found online? It's supposed to display the images in the list I created, but it doesn't run. The only change I made to this code was the creation of the list, so I have no idea why it does not work. The error I'm getting is:
Traceback (most recent call last):
File "bro.py", line 82, in <module>
slideshow.start()
File "bro.py", line 56, in start
self.main()
File "bro.py", line 48, in main
self.set_image()
File "bro.py", line 42, in set_image
self.image_name = next(self.images)
StopIteration
Here is the code:
#!/usr/bin/env python3
"""Display a slideshow from a list of filenames"""
import os
import tkinter
from itertools import cycle
from PIL import Image, ImageTk
class Slideshow(tkinter.Tk):
"""Display a slideshow from a list of filenames"""
def __init__(self, images, slide_interval):
"""Initialize
images = a list of filename
slide_interval = milliseconds to display image
"""
tkinter.Tk.__init__(self)
self.geometry("+0+0")
self.slide_interval = slide_interval
self.images = images
self.set_images(images)
self.slide = tkinter.Label(self)
self.slide.pack()
def set_images(self, images):
self.images = cycle(images)
def center(self):
"""Center the slide window on the screen"""
self.update_idletasks()
w = self.winfo_screenwidth()
h = self.winfo_screenheight()
size = tuple(int(_) for _ in self.geometry().split('+')[0].split('x'))
x = w / 2 - size[0] / 2
y = h / 2 - size[1] / 2
self.geometry("+%d+%d" % (x, y))
def set_image(self):
"""Setup image to be displayed"""
self.image_name = next(self.images)
filename, ext = os.path.splitext(self.image_name)
self.image = ImageTk.PhotoImage(Image.open(self.image_name))
def main(self):
"""Display the images"""
self.set_image()
self.slide.config(image=self.image)
self.title(self.image_name)
self.center()
self.after(self.slide_interval, self.start)
def start(self):
"""Start method"""
self.main()
self.mainloop()
if __name__ == "__main__":
slide_interval = 2500
# use a list
images = [
"/~/definitely/1550099164562.png",
"/~/definitely/1550770995551.png",
"/~/definitely/1550771217013.png",
"/~/definitely/1550771726391.jpg"]
# all the specified file types in a directory
# "." us the directory the script is in.
# exts is the file extentions to use. it can be any extention that pillow supports
# http://pillow.readthedocs.io/en/3.3.x/handbook/image-file-formats.html
import glob
images = glob.glob("*.jpg")
path = "."
exts = ["jpg", "bmp", "png", "gif", "jpeg"]
images = [fn for fn in os.listdir(path) if any(fn.endswith(ext) for ext in exts)]
# start the slideshow
slideshow = Slideshow(images, slide_interval)
slideshow.start()
Try next(iter(images)) instead of next(images).
BTW, I have Raspberry Pi slideshows for both Tkinter and Kivy on my tachyonlabs GitHub ... they also download photos from Instagram, but if you don't hook that up or there's no Internet connection they will just cycle through the images in the image directory you specify, with your chosen order and duration.
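Another thing worth checking (an assumption on my part, since the traceback shows StopIteration on the very first next()): if the glob/listdir matching finds no files, images is empty, and next() on a cycle() over an empty list raises exactly that StopIteration. A quick guard makes the failure explicit:
import os
from itertools import cycle

path = "."
exts = ["jpg", "bmp", "png", "gif", "jpeg"]
images = [fn for fn in os.listdir(path) if any(fn.lower().endswith(ext) for ext in exts)]

if not images:
    raise SystemExit("No images found in {!r}: cycle() over an empty list "
                     "makes next() raise StopIteration.".format(path))

slides = cycle(images)
print(next(slides))   # safe now: at least one filename exists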

'bool' object not iterable

I am working with Python 3, OpenCV 3.4, and Microsoft Azure's Face API function CF.face.detect().
As far as I know, a for loop needs an iterable object to run on, like a list, and a plain boolean is not iterable. But even though res1 is a list, I get this error:
TypeError: 'bool' object not iterable
Please help, Thanks in advance
Here is the code:
import unittest
import cognitive_face as CF
from PIL import Image, ImageFont, ImageDraw
import time
import cv2
from time import strftime
CF.Key.set('')
#print(CF.Key.get())
CF.BaseUrl.set('https://southeastasia.api.cognitive.microsoft.com/face/v1.0/')
#print(CF.BaseUrl.get())
"""Setup Person and Person Group related data."""
person_group_id = '' #id from training terminal
"""Unittest for `face.detect`."""
cap = cv2.VideoCapture('1.mp4')
while(cap.isOpened()):
ret, img = cap.read()
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
print("\n\n ##########.... LOOKING FOR FACES ....########## \n\n")
res1 = []
print(type(res1))
res1 = CF.face.detect(cap)
print('\n This is the res1: ', res1)
c = len(res1)
print('\nTOTAL FACES FOUND:', c)
detect_id = [] ##error was here so put exception
for i in range(c):
print("\n\n ##########.... DETECTING FACES ....########## \n\n")
print('\n This is i in range c', i, c)
detect_id.append(res1[i]['faceId'])
#print('\n\n detected faces id ', detect_id[i])
width = res1[i]['faceRectangle']['width']
height = res1[i]['faceRectangle']['height']
x = res1[i]['faceRectangle']['left']
y = res1[i]['faceRectangle']['top']
################## IF ENDS #########################################################################
cv2.imshow('image',img)
k = cv2.waitKey(100) & 0xff
if k == 27:
break
################ WHILE ENDS ####################################
cap.release()
cv2.destroyAllWindows()
@Jonasz is right, you should be detecting faces on images, meaning on frames from your mp4 file.
The method CF.face.detect expects a URI, so in the following code we'll write each frame to disk before passing it to CF.face.detect:
cap = cv2.VideoCapture('1.mp4')
count = 0 # <--
while(cap.isOpened()):
ret, img = cap.read()
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
filename = "frame%d.jpg" % count # <--
cv2.imwrite(filename, img) # <--
count+=1 # <--
print("\n\n ##########.... LOOKING FOR FACES ....########## \n\n")
res1 = []
print(type(res1))
res1 = CF.face.detect(filename) # <--
Shouldn't you use CF.face.detect on your captured image, not on the cap variable?
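Building on both suggestions, here is a defensive sketch (the frame files are written locally as in the answer above; the key/endpoint setup from the question is assumed, and I am assuming the client returns a list of face dicts on success):
import cv2
import cognitive_face as CF

cap = cv2.VideoCapture('1.mp4')
count = 0
while cap.isOpened():
    ret, img = cap.read()
    if not ret:                              # end of video or read failure: img is unusable
        break
    filename = "frame%d.jpg" % count
    count += 1
    cv2.imwrite(filename, img)
    res1 = CF.face.detect(filename)          # pass a path/URI, not the VideoCapture object
    if not isinstance(res1, list):           # guard: anything else means the call failed
        print("Unexpected detect() result:", res1)
        continue
    for face in res1:
        rect = face['faceRectangle']
        cv2.rectangle(img, (rect['left'], rect['top']),
                      (rect['left'] + rect['width'], rect['top'] + rect['height']),
                      (0, 255, 0), 2)
    cv2.imshow('image', img)
    if cv2.waitKey(100) & 0xff == 27:
        break

cap.release()
cv2.destroyAllWindows()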

Passing Scrapy response URL to Selenium and then Selenium response back to Scrapy

How do I pass a Scrapy response URL to Selenium, and then the Selenium response back to Scrapy?
I have this Scrapy spider first.py :
# -*- coding: utf-8 -*-
import scrapy
import re
import json
class FirstSpider(scrapy.Spider):
name = "first"
allowed_domains = ["someautosite.co.uk"]
start_urls = (
'http://www.someautosite.co.uk/some_specific_search_results',
)
def parse(self, response):
for car_url in response.xpath('//article[contains(@class, "standard")]/div/div[2]/div[1]/h1/a/@href').extract():
absoluteurl = response.urljoin(car_url)
# yield {'URL': absoluteurl}
yield scrapy.Request(absoluteurl, callback=self.parse_car)
def parse_car(self, response):
pattern = re.compile(r"var utag_data = ({.*?});", re.MULTILINE | re.DOTALL)
utag_data = response.xpath('//script[contains(.,"var utag")]/text()').re(pattern)[0]
utag_data_obj = json.loads(utag_data)
# make = utag_data_obj['make']
# model = utag_data_obj['model']
# yield {'Make':utag_data_obj['make'],
# 'model':utag_data_obj['model'],
# }
# yield utag_data
tel = response.xpath('//article/div[3]/section/div/div[@itemprop="telephone"]/text()').extract_first()
# tel_json_str = '{"tel":"' + str(tel) + '"}'
# tel_json_obj = json.loads(tel_json_str)
# Combine 2 JSON objects into one:
car_json = utag_data_obj.copy()
car_json.update({"tel": tel})
yield car_json
quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
yield scrapy.Request(quotations_url, callback=self.parse_quotations)
def parse_quotations(self, response): # parse insurance quotation website link with selenium
import filldata2
and then I have a Selenium module, filldata2.py, which tries to get a quotation for a car from a URL extracted in the parse_car method of the Scrapy spider above.
Now the Selenium module starts like this:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC # available since 2.26.0
from selenium.webdriver.common.keys import Keys
import time
import six
from six.moves.configparser import SafeConfigParser
regno = 'AA00AAA'
mile = '15000'
firstname = 'John'
lastname = 'Smith'
[...]
def yesno(idul):
idxpath = '//*[@id="{}"]'.format(idul)
return idxpath
def findid(idul):
found = driver.find_element_by_id(idul)
return found
def clickyes(idul):
idxpath = '//*[@id="{}"]'.format(idul)
arg = '{}//span[contains(text(), "Yes")]'.format(idxpath)
return driver.find_element_by_xpath(arg).click()
def clickno(idul):
idxpath = '//*[@id="{}"]'.format(idul)
arg = '{}//span[contains(text(), "No")]'.format(idxpath)
return driver.find_element_by_xpath(arg).click()
def clickspan(idul):
idxpath = '//*[@id="{}"]'.format(idul)
arg = '{}//span[1]'.format(idxpath)
driver.find_element_by_xpath(arg).click()
class DivSelect(object):
def __init__(self, idul, divtext):
self.idul = idul
self.divtext = divtext
# example: '//div[contains(text(), "Right Hand")]'
# self.divulxpath = '//div[contains(text(), "{}")]'.format(self.divtext)
self.idxpath = '//*[@id="{}"]'.format(self.idul)
def findid(self):
el = 'driver.find_element_by_id({})'.format(self.idul)
return el
@property
def clicky(self): # only works for selecting divs
if len(str(self.divtext)) >= 2 and not self.divtext.isdigit():
arg = '{}//div[contains(text(), "{}")]'.format(self.idxpath, self.divtext)
else:
arg = '{}//div[{}]/label/div'.format(self.idxpath, self.divtext)
print('driver.find_element_by_xpath("{}").click()'.format(arg))
driver.find_element_by_xpath(arg).click()
def printval(cee, cssid):
def getval():
val = driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).get_attribute('value')
if not val:
val = input('Care e valoarea masinii:\n')
driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).click()
fillin(cssid, val)
time.sleep(2)
# print(val)
# assert isinstance(val, object)
return val
valoare = getval()
if valoare.lower() == 'pret':
print('{} estimat este : £ {} '.format(cee, valoare)) if valoare else 'Nu era nici un {}({}) estimat'.format(
cee, cssid)
else:
print('{} estimat/a/e este : {} '.format(cee, valoare)) if valoare else 'Nu era nici un {}({}) estimat'.format(
cee, cssid)
def clickbutton(cssid):
driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).click()
def fillin(cssid, var):
return driver.find_element_by_id(str(cssid)).send_keys(var)
def fillinsugestionbox(cssid, var):
driver.find_element_by_id(str(cssid)).send_keys(var)
return driver.find_element_by_xpath('//*[@id=\"{0}\"]'.format(cssid)).send_keys(Keys.RETURN)
knowsRegistrationNumber = Yesno('knows-registration-number').clickyes
# 1.2 Then please enter it here to get started:
registrationNumber = driver.find_element_by_id('registration-number')
registrationNumber.send_keys(regno)
# 1.3 Find your vehicle find-vehicle-by-reg
findVehicleByReg = driver.find_element_by_id('find-vehicle-by-reg')
findVehicleByReg.click()
time.sleep(1)
# TODO : if no other variants
# 1.3.1 multiple-vehicles-section : a select list with more options
# multipleVehiclesSection = driver.find_element_by_id('multiple-vehicles-section')
# multipleVehiclesSection.click()
# possible-vehicles : the select list id
try:
element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "possible-vehicles")))
possibleVehicles = driver.find_element_by_id('possible-vehicles')
possibleVehicles.click()
print('am asteptat destul')
dropdown = possibleVehicles.find_elements_by_tag_name('option')
print('Am selectat :\n {} \n dintre urmatoarele:'.format(dropdown[1].text))
for option in dropdown[1:]:
print(option.text)
if dropdown:
dropdown[1].click()
except:
print('Elementul possible-vehicles nu e prezent')
# finally:
time.sleep(2)
# //*[#id="has-tracker"] Yes/No
hasTracker = Yesno('has-tracker').clickno
# //*[#id="imported"] Yes/No
imported = Yesno('imported').clickno
# //*[#id="steering"] - 2 Divs
# Choose from options :
# Left Hand or # Right Hand
steering = DivSelect('steering', 'Right Hand').clicky
# TODO: vezi ce faci daca nu are pret setat. Pune tu unul
# //*[#id="current-value"] - citeste valoarea
# driver.find_element_by_xpath('//*[#id="current-value"]')
printval('Pret', 'current-value')
# print('Pretul estimat este : £ {} '.format(currentValue)) if currentValue else 'Nu era nici un pret estimat'
printval('scaune', 'numberOfSeats-dropdown')
# //*[#id="has-modifications"]
hasModifications = Yesno('has-modifications').clickno
# clik next button
# //*[#id="vehicle-lookup-next"]
clickbutton('vehicle-lookup-next')
time.sleep(1)
# ============================================
# 2. Vehicle usage |
# ============================================
# 2.1 When did you buy the car?
# //*[#id="vehicle-usage"]//span[1]
vehicleUsage = Yesno('vehicle-usage').clickspan # I haven't bought this car yet
# 2.2 What do you use your car for?
# //*[#id="use-of-vehicle"]/ol/li[2]/div[2]/label/div/div[2]
# //*[#id="use-of-vehicle"]//div[2]
useOfVehicle = DivSelect('use-of-vehicle', '2').clicky # Social, Domestic, Pleasure and Commuting (SDPC)
# 2.3 What would you say your annual personal mileage is?
# //*[#id="annual-mileage"]
annualMileage = driver.find_element_by_id('annual-mileage')
annualMileage.send_keys(mile)
[...much more...]
...
...
fillin('email', email)
# Main telephone number
# Let the insurance providers answer your queries
# Let us keep you up to date
# //*[#id="communication-options"]/ol/li[2]/div[4]/label/div/div[2]
DivSelect('communication-options', 'Post').clicky
# Please tick this box to confirm you have read and understood our website Terms and Conditions, \
# any assumptions we may have made and Your Rewards Terms and Conditions. \
# If you do not understand any items within this document please contact us.
# //*[#id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span
# Yesno('contact-details').clickspan - nu merge
driver.find_element_by_xpath('//*[#id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span').click()
# //*[#id="contact-details-next"]
clickbutton('contact-details-next')
driver.implicitly_wait(10)
try:
element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "quotes")))
print('element = ', element)
try:
"""
wait for loading bar to go away:
"""
element2 = WebDriverWait(driver, 60).until(EC.invisibility_of_element_located((By.XPATH, '//*[@id="quotes-loading-container"]/div/div[1]')))
print('element2 = ', element2)
except:
print('bara de loading inca este activa. butonul more details cu cotatii nu e vizibil')
except:
print('tabelul cu cotatii nu e vizibil')
source_code = driver.find_element_by_id('quotes').get_attribute('innerHTML')
# element.get_attribute('innerHTML')
f = open('C:\\Users\\ZZZ\\PycharmProjects\\selenscrapy\\'+str(regno)+'.html', 'wb')
f.write(source_code.encode('utf-8'))
f.close()
I know the code is messy. I'm a Python beginner and I'm playing with this code to scrape some cars from a car-selling website, and I'm trying to get an insurance quotation for them from a different site. The link to the external insurance quotation site (full of JavaScript, which is why I need the Selenium webdriver) is a redirection link from the car-selling site, because the two sites collaborate.
Now, as I said before, this quotation URL needs to be parsed by Selenium, which I would like to keep as a separate module file, maybe even 2 separate files, one with the configuration and one with the actions to be taken.
How do I pass the insurance quotation URL obtained in the Scrapy FirstSpider parse_car() method to the Selenium module, and the response of the Selenium script (which is called source_code in the second module above) back to Scrapy in the FirstSpider parse_quotations() method?
Thank you!
Instead of yielding a request for quotations_url in first.py, can you create a Selenium webdriver and do the scraping in the webdriver?
def parse_car(self, response):
...
quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
# Start to work in a webdriver
browser = webdriver.Chrome()
browser.get(quotations_url)
# ... do whatever you want in the webdriver ...
# yield your item
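If you prefer to keep the Selenium code in its own module, as asked, here is a minimal sketch of the round trip (the function name get_quotation_html is hypothetical): expose one function that takes the URL and returns the rendered HTML, then wrap that HTML in a Scrapy HtmlResponse so the callback can keep using selectors:
# filldata2.py  (hypothetical interface)
from selenium import webdriver

def get_quotation_html(url):
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        # ... fill in the forms as in the long script above ...
        return driver.find_element_by_id('quotes').get_attribute('innerHTML')
    finally:
        driver.quit()


# first.py  (methods inside FirstSpider)
from scrapy.http import HtmlResponse
import filldata2

def parse_car(self, response):
    # ... build and yield car_json as before ...
    quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
    source_code = filldata2.get_quotation_html(quotations_url)
    fake_response = HtmlResponse(url=quotations_url, body=source_code, encoding='utf-8')
    yield from self.parse_quotations(fake_response)

def parse_quotations(self, response):
    # 'response' wraps the Selenium-rendered HTML, so response.xpath(...) works as usual
    yield {'quotes_html_length': len(response.text)}   # placeholder item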

error with openCV2

I am new to coding. Using this script:
from PIL import Image
from PIL.ImageChops import subtract
import numpy, math, time, glob, sys, os, logging, requests, random
def GreenScreen(infile, inbg ,outfile='output.png', keyColor=None,tolerance=None):
"""
http://gc-films.com/chromakey.html
http://www.cs.utah.edu/~michael/chroma/
:param infile: Greenscreen image location
:param inbg: Background image location
:param outfile: Output file location
:param keyColor: greenscreen color; it can be any singular color
:param tolerance: tolerance of cleaning
:return:
"""
if not keyColor:
keyColor = [151,44,21] #Y,Cb, and Cr values of the greenscreen
if not tolerance:
tolerance = [100,130] #Allowed Distance from Values
#open files
inDataFG = Image.open('/home/leonardo/Scrivania/in/KVfnt.png').convert('YCbCr')
Path = '/home/leonardo/Scrivania/background/'
FullPath = os.path.join(Path, random.choice(os.listdir(Path)))
BG = Image.open(FullPath).convert('RGB')
[Y_key, Cb_key, Cr_key] = keyColor
[tola, tolb]= tolerance
(x,y) = inDataFG.size #get dimensions
foreground = numpy.array(inDataFG.getdata()) #make array from image
maskgen = numpy.vectorize(colorclose) #vectorize masking function
alphaMask = maskgen(foreground[:,1],foreground[:,2] ,Cb_key, Cr_key, tola, tolb) #generate mask
alphaMask.shape = (y,x) #make mask dimensions of original image
imMask = Image.fromarray(numpy.uint8(alphaMask))#convert array to image
invertMask = Image.fromarray(numpy.uint8(255-255*(alphaMask/255))) #create inverted mask with extremes
#create images for color mask
colorMask = Image.new('RGB',(x,y),tuple([0,0,0]))
allgreen = Image.new('YCbCr',(x,y),tuple(keyColor))
colorMask.paste(allgreen,invertMask) #make color mask green in green values on image
inDataFG = inDataFG.convert('RGB') #convert input image to RGB for ease of working with
cleaned = subtract(inDataFG,colorMask) #subtract greens from input
BG.paste(cleaned,imMask)#paste masked foreground over background
# BG.show() #display cleaned image
BG.save(outfile, "JPEG") #save cleaned image
def colorclose(Cb_p,Cr_p, Cb_key, Cr_key, tola, tolb):
temp = math.sqrt((Cb_key-Cb_p)**2+(Cr_key-Cr_p)**2)
if temp < tola:
z = 0.0
elif temp < tolb:
z = ((temp-tola)/(tolb-tola))
else:
z = 1.0
return 255.0*z
def check_folders(logger):
if not os.path.exists('out/'):
os.mkdir('out/')
if not os.path.exists('background/'):
os.mkdir('background/')
logger.error("Place background images in background/")
sys.exit()
if not os.path.exists('in/'):
os.mkdir('in/')
logger.error("Place input files in in/")
sys.exit()
def begin_greenbox(logger):
"""
For all backgrounds loop through all input files into the out file
"""
for bg in glob.glob('background/*'):
continue
bg_name = bg.split('/')[-1].lower().strip('.jpg').strip('.png').strip('.jpeg')
for picture in glob.glob('in/*'):
continue
pic_name = picture.split('/')[-1].lower().strip('.JPG').strip('.png').strip('.jpeg')
output_file = 'out/' + bg_name + ' ' + pic_name + '.jpg'
one_pic = time.time()
GreenScreen(infile=picture ,inbg=bg, outfile=output_file)
one_pic_time_done = time.time()
time_arr.append(one_pic_time_done-one_pic)
logger.info(time_arr)
logger.info('done : %s' % pic_name)
def start_logging():
logging.basicConfig()
logger = logging.getLogger('greenbox')
logger.setLevel(logging.INFO)
return logger
if __name__ == '__main__':
time_start = time.time()
time_arr = []
logger = start_logging()
logger.info("Start time: %s" % time_start)
check_folders(logger)
begin_greenbox(logger)
time_end = time.time()
logger.info("End time: %s" % time_end)
Everything is okay and the image is saved in the /out folder. Using this code:
from cv2 import *
# initialize the camera
cam = VideoCapture(0) # 0 -> index of camera
s, img = cam.read()
if s: # frame captured without any errors
namedWindow("cam-test",WINDOW_AUTOSIZE)
imwrite('/home/leonardo/Scrivania/in/KVfnt.png',img) #save image
Everything is okay and the image is captured from the camera and saved in the /in folder. If I add the second code to the first one:
from PIL import Image
from PIL.ImageChops import subtract
import numpy, math, time, glob, sys, os, logging, requests, random
from cv2 import *
# initialize the camera
cam = VideoCapture(0) # 0 -> index of camera
s, img = cam.read()
if s: # frame captured without any errors
namedWindow("cam-test",WINDOW_AUTOSIZE)
imwrite('/home/leonardo/Scrivania/in/KVfnt.png',img) #save image
def GreenScreen(infile, inbg ,outfile='output.png', keyColor=None,tolerance=None):
"""
http://gc-films.com/chromakey.html
http://www.cs.utah.edu/~michael/chroma/
:param infile: Greenscreen image location
:param inbg: Background image location
:param outfile: Output file location
:param keyColor: greenscreen color; it can be any singular color
:param tolerance: tolerance of cleaning
:return:
"""
if not keyColor:
keyColor = [151,44,21] #Y,Cb, and Cr values of the greenscreen
if not tolerance:
tolerance = [100,130] #Allowed Distance from Values
#open files
inDataFG = Image.open('/home/leonardo/Scrivania/in/KVfnt.png').convert('YCbCr')
Path = '/home/leonardo/Scrivania/background/'
FullPath = os.path.join(Path, random.choice(os.listdir(Path)))
BG = Image.open(FullPath).convert('RGB')
[Y_key, Cb_key, Cr_key] = keyColor
[tola, tolb]= tolerance
(x,y) = inDataFG.size #get dimensions
foreground = numpy.array(inDataFG.getdata()) #make array from image
maskgen = numpy.vectorize(colorclose) #vectorize masking function
alphaMask = maskgen(foreground[:,1],foreground[:,2] ,Cb_key, Cr_key, tola, tolb) #generate mask
alphaMask.shape = (y,x) #make mask dimensions of original image
imMask = Image.fromarray(numpy.uint8(alphaMask))#convert array to image
invertMask = Image.fromarray(numpy.uint8(255-255*(alphaMask/255))) #create inverted mask with extremes
#create images for color mask
colorMask = Image.new('RGB',(x,y),tuple([0,0,0]))
allgreen = Image.new('YCbCr',(x,y),tuple(keyColor))
colorMask.paste(allgreen,invertMask) #make color mask green in green values on image
inDataFG = inDataFG.convert('RGB') #convert input image to RGB for ease of working with
cleaned = subtract(inDataFG,colorMask) #subtract greens from input
BG.paste(cleaned,imMask)#paste masked foreground over background
# BG.show() #display cleaned image
BG.save(outfile, "JPEG") #save cleaned image
def colorclose(Cb_p,Cr_p, Cb_key, Cr_key, tola, tolb):
temp = math.sqrt((Cb_key-Cb_p)**2+(Cr_key-Cr_p)**2)
if temp < tola:
z = 0.0
elif temp < tolb:
z = ((temp-tola)/(tolb-tola))
else:
z = 1.0
return 255.0*z
def check_folders(logger):
if not os.path.exists('out/'):
os.mkdir('out/')
if not os.path.exists('background/'):
os.mkdir('background/')
logger.error("Place background images in background/")
sys.exit()
if not os.path.exists('in/'):
os.mkdir('in/')
logger.error("Place input files in in/")
sys.exit()
def begin_greenbox(logger):
"""
For all backgrounds loop through all input files into the out file
"""
for bg in glob.glob('background/*'):
continue
bg_name = bg.split('/')[-1].lower().strip('.jpg').strip('.png').strip('.jpeg')
for picture in glob.glob('in/*'):
continue
pic_name = picture.split('/')[-1].lower().strip('.JPG').strip('.png').strip('.jpeg')
output_file = 'out/' + bg_name + ' ' + pic_name + '.jpg'
one_pic = time.time()
GreenScreen(infile=picture ,inbg=bg, outfile=output_file)
one_pic_time_done = time.time()
time_arr.append(one_pic_time_done-one_pic)
logger.info(time_arr)
logger.info('done : %s' % pic_name)
def start_logging():
logging.basicConfig()
logger = logging.getLogger('greenbox')
logger.setLevel(logging.INFO)
return logger
if __name__ == '__main__':
time_start = time.time()
time_arr = []
logger = start_logging()
logger.info("Start time: %s" % time_start)
check_folders(logger)
begin_greenbox(logger)
time_end = time.time()
logger.info("End time: %s" % time_end)
I obtain this error:
File "chromakey+upload.py", line 116, in <module>
begin_greenbox(logger)
File "chromakey+upload.py", line 97, in begin_greenbox
GreenScreen(infile=picture ,inbg=bg, outfile=output_file)
File "chromakey+upload.py", line 56, in GreenScreen
cleaned = subtract(inDataFG,colorMask) #subtract greens from input
TypeError: src1 is not a numpy array, neither a scalar
What is the problem? Thank you for your answers.
As the error says:
src1 is not a numpy array, neither a scalar
Perhaps, you should try:
cleaned = subtract(numpy.array(inDataFG.getdata()),numpy.array(colorMask.getdata()))
Edit
There is a 'conflict' on subtract:
from PIL.ImageChops import subtract # first subtract
from cv2 import * # OpenCV has a subtract too
This is one of the reasons to use namespaces on your calls.
If your main image lib is PIL, maybe you should do import cv2 and use cv2.* when needed.
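A minimal sketch of that fix, based only on the imports already shown in the question: give OpenCV its own namespace so the star import can no longer shadow PIL.ImageChops.subtract.
from PIL import Image
from PIL.ImageChops import subtract      # PIL's subtract, used on Image objects in GreenScreen()
import numpy, math, time, glob, sys, os, logging, requests, random
import cv2                               # namespaced: use cv2.VideoCapture, cv2.imwrite, ...

# initialize the camera
cam = cv2.VideoCapture(0)                # 0 -> index of camera
s, img = cam.read()
if s:                                    # frame captured without any errors
    cv2.namedWindow("cam-test", cv2.WINDOW_AUTOSIZE)
    cv2.imwrite('/home/leonardo/Scrivania/in/KVfnt.png', img)   # save image

# ... the rest of the script is unchanged; subtract() still refers to PIL's version:
# cleaned = subtract(inDataFG, colorMask)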
