Trying to parse pdf with russian text, but can't make it work
from PyPDF2 import PdfReader
reader = PdfReader("./mpp/1.pdf")
page = reader.pages[0]
m = page.extract_text()
print(m)
File pdf: https://disk.yandex.ru/i/qSJfFZJFDuLDIA
i'm trying use .encode, but it's not help.
In total, i need a this:
route1 = {
"bus": "Yutong г/н 499",
"stations": "12"
}
stations = ['1; ул. Арсеньева, м-н Пчелка; 7:40','2; ул. Агеева, ДК "Юность"; 7:44', '3; ул. Горького, минирынок Исток; 7:55', '....etc up to 12']
I made a GUI Application which looks like this:
The ones marked red are Tkinter Text widgets and the ones marked yellow are Tkinter Entry widgets
After taking user input, the data is to be added to a PSD file and then rendered as an image. But Lets say, after taking the following data as input:
It renders the following Photoshop file:
How do I fix this issue that it does not recognize "\n" properly and hence the rendered document is rendered useless.
Here is the code which deals with converting of the accepted user data into strings and then adding it to Photoshop template and then rendering it:
def DataAdder2CSV():
global edate, eSNO, eage, egender, ename, ePID, econtact, ecomp, eallergy, ehistory, eR
e=edate.get()
a=eSNO.get()
d=eage.get()
f=egender.get()
b=ename.get()
c=ePID.get()
g=econtact.get()
h=ecomp.get(1.0,END)
i=eallergy.get(1.0,END)
j=ehistory.get(1.0,END)
k=eR.get(1.0,END)
data=[a,b,c,d,e,f,g,h,i,j,k]
file=open("Patient_Data.csv","a", newline="")
writer=csv.writer(file, delimiter=",")
writer.writerow(data)
file.close()
messagebox.showinfo("Prescription Generator", "Data has been saved to the database successfully!")
import win32com.client, os
objShell = win32com.client.Dispatch("WScript.Shell")
UserDocs = objShell.SpecialFolders("MyDocuments")
from tkinter import filedialog
ExpDir=filedialog.askdirectory(initialdir=UserDocs, title="Choose Destination Folder")
psApp = win32com.client.Dispatch("Photoshop.Application")
psApp.Open("D:\Coding\Python Scripts\Dr Nikhil Prescription App\Prescription Generator\Presc_Template.psd")
doc = psApp.Application.ActiveDocument
lf1 = doc.ArtLayers["name"]
tol1 = lf1.TextItem
tol1.contents = b
lf2 = doc.ArtLayers["age"]
tol2 = lf2.TextItem
tol2.contents = d
lf3 = doc.ArtLayers["gender"]
tol3 = lf3.TextItem
tol3.contents = f
lf4 = doc.ArtLayers["pid"]
tol4 = lf4.TextItem
tol4.contents = c
lf4 = doc.ArtLayers["date"]
tol4 = lf4.TextItem
tol4.contents = e
lf5 = doc.ArtLayers["contact"]
tol5 = lf5.TextItem
tol5.contents = g
lf6 = doc.ArtLayers["complaint"]
tol6 = lf6.TextItem
varH=" "+h.rstrip("\n")
tol6.contents =varH
lf7 = doc.ArtLayers["allergy"]
tol7 = lf7.TextItem
tol7.contents = i.rstrip("\n")
lf8 = doc.ArtLayers["history"]
tol8 = lf8.TextItem
varJ=" "+j.rstrip("\n")
tol8.contents =varJ
lf9 = doc.ArtLayers["R"]
tol9 = lf9.TextItem
tol9.contents = k.rstrip("\n")
options = win32com.client.Dispatch('Photoshop.ExportOptionsSaveForWeb')
options.Format = 13
options.PNG8 = False
pngfile =ExpDir+f"/{c}-{b}_({e}).png"
doc.Export(ExportIn=pngfile, ExportAs=2, Options=options)
messagebox.showinfo("Prescription Generator", "Prescription has been saved in the desired location successfully!")
There are 3 ways of expressing new line characters:
MacOS uses \r
Linux uses \n
Windows uses \r\n
Python and tkinter use \n but it looks like psApp.Application uses \r instead. That is why the document isn't rendered properly. For more info read the answers to this question.
I have an xml like this:
<library>
<content content-id="title001">
<content-links>
<content-link content-id="Number1" />
<content-link content-id="Number2" />
</content-links>
</content>
<content content-id="title002">
<content-links>
<content-link content-id="Number3" />
</content-links>
</content>
<content content-id="Number1">
<content-links>
<content-link content-id="Number1b" />
</content-links>
</content
</library>
I would need to get all the content-id that are linked to specific content-id titles. For example, for this case I would need all the ids that are linked for title001 (I might need for more titles, so it would be a list of titles that need to be found). And all these ids be added to a list that would look like:
[title001, Number1, Number2, Number1b]
So I guess that I need to recursively check every content and then get the content-id from the content-link to go to the next content and check in this one all the content-link going to the next one until the xml is completely read.
I am not able to find the recursive solution to this.
Adding the code that I got until now for this:
from lxml import etree as et
def get_ids(content):
"""
"""
content_links = content.findall('content-links/content-link')
print(content_links)
if content_links:
for content_link in content_links:
print(content_link,content_link.get('content-id'))
cl = content_link.get('content-id')
cont = x.find(f'content[#id="{cl}"]')
if cont is not None:
get_ids(cont)
if __name__ == '__main__':
"""
"""
x = et.fromstring(xml)
ids = ['title001']
for id in ids:
content = x.find(f'content[#id="{content-id}"]')
get_ids(content)
Try the following code:
from lxml import etree as et
parser = et.XMLParser(remove_blank_text=True)
tree = et.parse('Input.xml', parser)
root = tree.getroot()
cidList = ['title001'] # Your source list
cidDct = { x: 0 for x in cidList }
for elem in root.iter('content'):
cid = elem.attrib.get('content-id', '')
# print(f'X {elem.tag:15} cid:{cid}')
if cid in cidDct.keys():
# print(f'** Found: {cid}')
for elem2 in elem.iter():
if elem2 is not elem:
cid2 = elem2.attrib.get('content-id', '')
# print(f'XX {elem2.tag:15} cid:{cid2}')
if len(cid2) > 0:
# print(f'** Add: {cid2}')
cidDct[cid2] = 0
For the test you may uncomment printouts above.
Now when you print list(cidDct.keys()), you will get the
wanted ids:
['title001', 'Number1', 'Number2', 'Number1b']
I want retrieve the names from the table and store them in a list. https://www.in.pampers.com/pregnancy/baby-names/article/top-indian-baby-names
However, I am not able to get the text. It returns
'NoneType' object has not attribute 'text'
https://www.in.pampers.com/pregnancy/baby-names/article/top-indian-baby-names
Also, I don't want the anchor tag to be included in the list of names. I just want the text of the anchor tag.
from bs4 import BeautifulSoup
import requests
import lxml
web_page = requests.get("https://www.in.pampers.com/pregnancy/baby-names/article/top-indian-baby-names")
# if requests.status_code == 200:
bs = BeautifulSoup(web_page.text, 'lxml')
tables = bs.findAll("table")
for table in tables:
for row in table.findAll("tr"):
print(row.find('td').text)
import pandas as pd
df = pd.read_html(
"https://www.in.pampers.com/pregnancy/baby-names/article/top-indian-baby-names")
names = []
for item in df:
names.extend(item['Name'].to_list())
print(names)
Output:
['Aaradhya', 'Adah', 'Adhira', 'Alisha', 'Amoli', 'Anaisha', 'Ananya', 'Anika', 'Anushka', 'Asmee', 'Avni', 'Carina', 'Drishti', 'Hiya', 'Ira', 'Ishana', 'Ishita', 'Kaia', 'Kashvi', 'Keya', 'Kimaya', 'Krisha', 'Larisa', 'Mahika', 'Mayra', 'Mehar', 'Mirai', 'Mishka', 'Naitee', 'Navya', 'Nehrika', 'Neysa', 'Pavati', 'Prisha', 'Ryka', 'Rebecca', 'Saanvi', 'Sahana', 'Sai', 'Saisha', 'Saloni', 'Shanaya', 'Shrishti', 'Sneha', 'Taahira', 'Taara', 'Tanvi', 'Viti', 'Zara', 'Aahva', 'Aadiv', 'Aarav', 'Akanksh', 'Alex', 'Anant', 'Atiksh', 'Ayaan', 'Bhuv', 'Dasya', 'Gian', 'Hem', 'Idhant', 'Ishank', 'Jash', 'Jay', 'Joseph', 'Kabir', 'Kahaan', 'Kairav', 'Kevin', 'Laksh', 'Luv', 'Manan', 'Mohammad', 'Naksh', 'Nimit', 'Nirav', 'Pahal', 'Parv', 'Pranay', 'Rachit', 'Raj', 'Ranbir', 'Raunak', 'Reyansh', 'Rishaan', 'Rishit', 'Rohan', 'Rudra',
'Rushil', 'Sadhil', 'Sarthak', 'Taarush', 'Taksh', 'Ved', 'Vihaan', 'Vivaan', 'Yash', 'Yug', 'Zuber']
content = driver.find_element_by_class_name('topics-sec-block')
container = content.find_elements_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]')
the code is below:
for i in range(0, 40):
title = []
url = []
heading=container[i].find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a/h2').text
link = container[i].find_element_by_xpath('//div[#class="col-sm-7 topics-sec-item-cont"]/a')
title.append(heading)
url.append(link.get_attribute('href'))
print(title)
print(url)
it is giving me the 40 number of lines but all lines have same title and url as (some of them is given below):
['Stuck in Mexico: Central American asylum seekers in limbo']
['https://www.aljazeera.com/news/2020/03/stuck-mexico-central-american-asylum-seekers-limbo-200305103910955.html']
['Stuck in Mexico: Central American asylum seekers in limbo']
['https://www.aljazeera.com/news/2020/03/stuck-mexico-central-american-asylum-seekers-limbo-200305103910955.html']