Trying to extract geographic coordinates from a .pdf file with python3 - python-3.x

I am trying to extract geographic coordinates in UTM format from a .pdf file with python3 on the Ubuntu operating system, with the following code:
from pathlib import Path
import textract
import numpy as np
import re
import os
import pdfminer
def main(_file):
    """Extract coordinate-like numbers from a PDF and write them to a CSV file.

    The text of the PDF is extracted with textract's ``pdfminer`` method and
    scanned with a regular expression. The matches are written, joined by
    ``|``, to ``<pdf stem>.csv`` in the current working directory. The output
    file is created even when nothing matches.

    Args:
        _file: Path of the PDF file to read.

    Returns:
        None. If textract raises ``ShellError`` (for example because its
        external ``pdf2txt.py`` helper is not installed), the error is
        printed and no file is written.
    """
    try:
        text = textract.process(_file, method="pdfminer")
    except textract.exceptions.ShellError as ex:
        print(ex)
        return

    coords = re.compile(r"\d?\.?\d+\.+\d+\,\d{2}")
    results = coords.findall(text.decode())

    # Path.stem drops the final suffix and, unlike slicing the name with
    # -len(suffix), still yields the full name when the path has no suffix.
    out_path = "%s.csv" % Path(_file).stem
    # A separate name for the handle keeps the _file parameter intact.
    with open(out_path, "w") as out_file:
        if results:
            out_file.write("|".join(results))
if __name__ == "__main__":
    # Script entry point: run the extraction on the sample PDF.
    _file = "/home/cristian33/python_proj/folder1/buscarco.pdf"
    main(_file)
When I run it, it gives me the following error:
The command pdf2txt.py /home/cristian33/python_proj/folder1/buscarco.pdf failed because the executable
pdf2txt.py is not installed on your system. Please make
sure the appropriate dependencies are installed before using
textract:
http://textract.readthedocs.org/en/latest/installation.html
Does anybody know why this error occurs?
thanks

Related

How to extract only CR No only from image

Sample image
I need to extract the CR No. from the sample image given above. Using EasyOCR, I got the output in a complex nested list form. How can I update the code to filter out all the other detected text/numbers and get only the CR No.? I am running out of ideas, and help will be appreciated. What I have tried so far:
#Import libraries
import os
import easyocr
import cv2
from matplotlib import pyplot as plt
import numpy as np
IMAGE_PATH = 'H://CDAC//Spyder_projects//CR_No//input_image//input7.jpg'
reader = easyocr.Reader(['en'])
# Each detection is indexed below as item[1] for its recognised text.
result3 = reader.readtext(IMAGE_PATH)
result3

# Pair every detection with the one that follows it. Walking the pairs means
# a 'CR No' label in the last position can no longer index past the end.
my_list2 = []
for label, value in zip(result3, result3[1:]):
    if label[1] == 'CR No':
        print(label)
        print(value)
        my_list2.append(value + label)
print(my_list2)

# Only report a number when a label was actually found.
if my_list2:
    print('The CR No is:', my_list2[0][1])
else:
    print('CR No label not found')
The expected output should be- 211022203481161

Argument error when compiling .exe from Python using PyInstaller

I am trying to write a screen recorder program in python. My code runs normally in the compiler. But when I convert it to .exe, it raises this error:
[ERROR:0] global C:\projects\opencv-python\opencv\modules\videoio\src\cap.cpp (415) cv::VideoWriter::open VIDEOIO(CV_IMAGES): raised OpenCV exception:OpenCV(4.2.0) C:\projects\opencv-python\opencv\modules\videoio\src\cap_images.cpp:253: error: (-5:Bad argument) CAP_IMAGES: can't find starting number (in the name of file): project.avi in function 'cv::icvExtractPattern'
I used pyinstaller to convert to .exe.
This is my code:
from tkinter import*
from tkinter import messagebox as msj
from PIL import ImageTk, Image
from PIL import ImageGrab
import os
import time
import cv2
import numpy as np
import glob
# State read and updated by the recorder callbacks.
recording = False      # True while screenshots should be captured
i = 0                  # number of the most recent snapshot file
size = (100, 100)

# Main application window.
mainWindow = Tk()
mainWindow.title("ScreenRecorder")
mainWindow.geometry("200x200")

# Directory that contains this script.
scriptDirectory = os.path.dirname(os.path.realpath(__file__))
def convert(imageCount):
    """Assemble the saved snapshots into ``project.avi`` and delete them.

    Snapshots are looked up as ``snap1.jpg`` .. ``snap<imageCount>.jpg`` in
    the current working directory. No video is written when no snapshot
    could be loaded.

    Args:
        imageCount: Number of the last snapshot file.
    """
    # One list drives both reading and deleting, so the two ranges cannot
    # drift apart (the last snapshot used to be deleted without being read).
    names = ["snap" + str(n) + ".jpg" for n in range(1, imageCount + 1)]

    frames = []
    for name in names:
        if not os.path.exists(name):
            continue
        img = cv2.imread(name)
        if img is None:  # cv2.imread signals an unreadable file with None
            continue
        frames.append(img)

    if frames:
        # As before, the video size is taken from the last frame loaded.
        height, width = frames[-1].shape[:2]
        out = cv2.VideoWriter('project.avi', cv2.VideoWriter_fourcc(*'DIVX'), 9, (width, height))
        for frame in frames:
            out.write(frame)
        out.release()

    for name in names:
        try:
            os.remove(name)
        except FileNotFoundError:
            # A snapshot that was never written needs no clean-up.
            pass
# Identifier of the pending ``after`` callback. Tracking it keeps a single
# polling chain alive even when record() is also called directly.
_record_job = None


def record():
    """Save one screenshot while recording is on, then schedule the next poll.

    While the module-level ``recording`` flag is true, the snapshot counter
    ``i`` is incremented and the screen is saved as ``snap<i>.jpg``. The
    function always re-arms itself through ``mainWindow.after``.
    """
    global i, _record_job
    # A direct call (for example from a button handler) would otherwise
    # start a second timer chain next to the one that is already pending.
    if _record_job is not None:
        mainWindow.after_cancel(_record_job)
    print(recording)
    if recording:
        i += 1
        # The thumbnail that used to be built from the saved file was never
        # used, so the snapshot is simply written to disk.
        ImageGrab.grab().save("snap" + str(i) + ".jpg", 'JPEG')
    _record_job = mainWindow.after(1, record)
def startButton():
    """Switch recording on and run one capture step immediately."""
    global recording
    print("ehe")
    recording = True
    record()
def stopButton():
    """Switch recording off and convert the captured snapshots to a video."""
    global recording
    recording = False
    record()
    # i holds the number of the last snapshot that was written.
    convert(i)
# Keep the widgets under their own names so the callback functions
# startButton/stopButton are not overwritten by the Button objects.
startWidget = Button(text="Start", command=startButton)
startWidget.pack()
stopWidget = Button(text="Stop", command=stopButton)
stopWidget.pack()

# Schedule the first poll, then hand control to the Tk event loop.
mainWindow.after(1, record)
mainWindow.mainloop()
I would suggest a different, simpler approach: try 'auto-py-to-exe'. It is a tool that can be installed from the internet or with pip.
See here. It is the only method I use for my own code.
Also, as far as I know, a program that does not run correctly as a .py file will not run as an .exe either.
I hope this helps.

pytesseract unable to recognize complex math formula from image

I am using the pytesseract module in Python. pytesseract recognizes text from images, but it doesn't work on images that contain complex math formulas such as square roots, derivatives, integrals, or other math problems and equations.
code 2.py
# Import modules
from PIL import Image
import pytesseract
import cv2
# Point pytesseract at the Tesseract executable.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Load the input picture as a PIL image.
image = Image.open('23.jpg')

# Run OCR twice: once with the 'eng+equ' language data and once without a
# lang argument.
image_to_text = pytesseract.image_to_string(image, lang='eng+equ')
image_to_text1 = pytesseract.image_to_string(image)

# Show the result of the 'eng+equ' pass.
print(image_to_text)
# workon digits
Output:
242/33
2x
2x+3X
2X+3x=4
2x?-3x +1=0
(x-1)(x+1) =x2-1
(x+2)/((x+3)(x-4))
7-4=3
V(x/2) =3
2xx—343=6x—3 (x#3)
Jeeta =e* +e
dy 2
S=2?-3
dz ¥
dy = (a? — 3)dx
Input image
To work with MATH language you should install the proper language for tesseract. In your case it is 'equ' from https://github.com/tesseract-ocr/tessdata/raw/3.04.00/equ.traineddata . The full list of available languages is described at https://tesseract-ocr.github.io/tessdoc/Data-Files
I'm not familiar with tesseract language install for windows. But there is a documentation at https://github.com/tesseract-ocr/tesseract/wiki :
If you want to use another language, download the appropriate training
data, unpack it using 7-zip, and copy the .traineddata file into the
'tessdata' directory, probably C:\Program Files\Tesseract-OCR\tessdata
And first try to process your image with the CLI only (without Python), because the CLI has the full list of options to tune.
I used this Code and it worked!
import re
import cv2
import pytesseract as tess
path = r"C:\Users\10\AppData\Local\Tesseract-OCR\tesseract.exe"
tess.pytesseract.tesseract_cmd = path
png = "Downloads/m.png"
text = tess.image_to_string(png)
# str.replace returns a new string, so the result has to be kept.
text = text.replace(" ", "")
# A digit followed by an operator. The '-' is escaped so it is a literal minus
# instead of the range '+'..'/', which also matched ',' and '.'.
pattern = re.compile(r"[0-9][=+\-/*]")
# Test whole lines: iterating the string itself yields single characters,
# which can never match a two-character pattern.
# NOTE(review): assumes the intent was to keep every line that contains a
# digit/operator pair - confirm.
equations = [line for line in text.splitlines() if pattern.search(line)]
# Print the first line of the recognised text.
print(text.partition("\n")[0])

Using Geopandas and the county-choropleth module

I am unable to get figure_factory to recognize the county_choropleth module, which contains create_choropleth (on line 512, I believe).
I am just using a basic example from the plotly website
https://plot.ly/python/county-choropleth/
Edit: I've tried to implement suggestions from a previous question by importing as:
from plotly.figure_factory._county_choropleth import create_choropleth
and then:
# Build the county choropleth figure from the FIPS codes and their values.
fig = create_choropleth(fips=fips, values=values)
# The plotting call is py.plot; 'py.ploy' does not exist.
py.plot(fig, filename='basic-choropleth')
py.iplot(fig, filename='choropleth of some cali counties - full usa scope')
But i receive the following error (in picture):
File "C:\ProgramData\Miniconda3\lib\site-packages\fiona\__init__.py", line 162, in open
raise IOError("no such file or directory: %r" % path)
OSError: no such file or directory: 'C:\ProgramData\Miniconda3\lib\site-packages\plotly\package_data\gz_2010_us_050_00_500k.shp'
So what I did was transfer the files in C:\ProgramData\Miniconda3\pkgs\plotly-3.1.1-py36h28b3542_0\Lib\site-packages\plotly
to:
C:\ProgramData\Miniconda3\Lib\site-packages\plotly
Then I ran the code:
import plotly.plotly as py
from plotly.figure_factory._county_choropleth import create_choropleth
# Authenticate against the plotly online service.
py.sign_in('chessybo', 'XXXXXXXXXXX')

# County FIPS codes to draw, with one value per code.
fips = [
    '06021', '06023', '06027',
    '06029', '06033', '06059',
    '06047', '06049', '06051',
    '06055', '06061',
]
values = range(len(fips))

# Build the figure through the directly imported create_choropleth.
fig = create_choropleth(fips=fips, values=values)
py.plot(fig, filename='choropleth of some cali counties - full usa scope')
and it worked.
What you will get after executing this code:
# import necessary libraries
import geopandas
import shapely
import shapefile
import plotly
from plotly.figure_factory._county_choropleth import create_choropleth
# Report the versions of the libraries involved.
print(plotly.__version__, geopandas.__version__, shapely.__version__, shapefile.__version__)

# County FIPS codes to draw, with one value per code.
fips = [
    '06021', '06023', '06027',
    '06029', '06033', '06059',
    '06047', '06049', '06051',
    '06055', '06061',
]
values = range(len(fips))

# Build the choropleth figure.
fig = create_choropleth(fips=fips, values=values)

# Render offline and write the plot to an HTML file.
plotly.offline.plot(fig, filename='choropleth_usa.html')
Just in my case, script return the following:

Why is my image_path undefined when using export_graphviz? - Python 3

I'm trying to run this machine learning tree algorithm code in IPython:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
# Load the iris data set and keep the feature columns from index 2 onward.
iris = load_iris()
X = iris.data[:, 2:] # petal length and width
y = iris.target
# Fit a decision tree whose depth is limited to 2.
tree_clf = DecisionTreeClassifier(max_depth=2)
tree_clf.fit(X, y)
from sklearn.tree import export_graphviz
# NOTE(review): image_path is neither defined nor imported in this snippet,
# so the call below cannot run as written.
export_graphviz(tree_clf, out_file=image_path("iris_tree.dot"),
feature_names=iris.feature_names[2:],
class_names=iris.target_names,
rounded=True,
filled=True
)
But I get this error when run in IPython:
I'm unfamiliar with export_graphviz, does anyone have any idea how to correct this?
I guess you are following the book "Hands-On Machine Learning with Scikit-Learn and TensorFlow" by Aurélien Géron. I encountered the same problem while trying out the "Decision Trees" chapter. You can always refer to his GitHub notebooks. For your code, you may refer to the "decision tree" notebook.
Below I paste the code from notebook. Please do go ahead and have a look at the notebook also.
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals
# Common imports
import numpy as np
import os
# Seed NumPy's global random generator with a fixed value so that this
# notebook's output is stable across runs.
np.random.seed(42)
# To plot pretty figures
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
# Font sizes for the axis labels and the tick labels.
plt.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "decision_trees"


def image_path(fig_id):
    """Return the path of ``fig_id`` inside this chapter's images folder.

    Args:
        fig_id: File name (or name stem) of the figure.

    Returns:
        ``PROJECT_ROOT_DIR/images/CHAPTER_ID/fig_id`` joined with the
        platform's path separator.
    """
    return os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID, fig_id)
def save_fig(fig_id, tight_layout=True):
    """Save the current matplotlib figure as ``image_path(fig_id) + ".png"``.

    Args:
        fig_id: Name of the figure, passed to ``image_path``.
        tight_layout: When true, ``plt.tight_layout()`` is applied before
            the figure is saved.
    """
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    target = image_path(fig_id) + ".png"
    # Make sure the destination folder exists before writing the file.
    folder = os.path.dirname(target)
    if folder:
        os.makedirs(folder, exist_ok=True)
    plt.savefig(target, format='png', dpi=300)
To get rid of all the mess simply remove image_path,
now out_file="iris_tree.dot", after running that command a file will be saved in your folder named iris_tree. Open that file in Microsoft Word and copy all of its content. Now open your browser and type "webgraphviz" and then click on the first link. Then delete whatever is written in white space and paste your code which is copied from iris_tree. Then click "generate graph". Scroll down and your graph is ready.
I know you might have got what you were looking for. But in case you don't, all you need to do is just replace:
out_file=image_path("iris_tree.dot")
with:
out_file="iris_tree.dot"
This will create the .dot file in the same directory in which your current script is.
You can also give the absolute path to where you want to save the .dot file as:
out_file="/home/cipher/iris_tree.dot"
You must change
out_file=image_path("iris_tree.dot"),
to the following line:
out_file="C:/Users/VIDA/Desktop/python/iris_tree.dot",
You can directly type instead of using the webgraphviz, if you are using sklearn version 0.20.
import graphviz
# Read the exported .dot description of the tree.
with open("iris_tree.dot") as f:
    dot_graph = f.read()
# Render the graph inline in the notebook.
display(graphviz.Source(dot_graph))
With sklearn 0.22 you have to change again. See sklearn users guide.
I have a sklearn with the version of 0.20.1, and I got the example to work through the line below.
# Write the fitted tree to iris_tree.dot, labelling the nodes with the
# feature names from index 2 onward.
export_graphviz(
    tree_clf,
    out_file="iris_tree.dot",
    feature_names=iris.feature_names[2:],
)

Resources