Converting an EPUB to an HTML file using pypandoc - python-3.x

# Question snippet: convert corona.epub to HTML with pypandoc, writing the
# result to corona1.html.
import pypandoc
html=pypandoc.convert_file("corona.epub",'html',outputfile="corona1.html")
# The output went to the file named by outputfile, so the returned value is
# expected to be an empty string.
assert html==""
I tried to convert an EPUB file into an HTML file, but the output is text-only HTML: there are no images in the file. How can I convert it to HTML with the images included?

You could use:
$ unzip "corona.epub"
from your command line, and it will extract all the XHTML files inside the EPUB.

import os

import pypandoc

# Tell pypandoc which pandoc binary to use, unless the variable is already set.
os.environ.setdefault('PYPANDOC_PANDOC', '/opt/homebrew/bin/pandoc')

filepath = "<full path of your epub file...>"
pathname, filename = os.path.split(filepath)
# os.path.split returns '' as the directory for a bare file name. Fall back to
# the current directory so the media folder and the output file are not built
# from an empty path (which previously produced '/index.html').
pathname = pathname or os.curdir
targetfilename = 'index.html'

# Convert the EPUB to HTML5, extracting the embedded media (images) next to
# the generated index.html so the page can reference them.
pypandoc.convert_file(
    filepath,
    format='epub',
    to='html5',
    extra_args=[
        '--read=epub',
        f'--extract-media={pathname}',
        '--wrap=none',
    ],
    encoding='utf-8',
    outputfile=os.path.join(pathname, targetfilename),
    filters=None,
    verify_format=True,
)

Related

How to create and add files to a directory?

I'm writing a program to take large PDFs and convert each page to a .jpg, then add the .jpgs of each PDF file to their own directory (which the program needs to create).
I have completed the conversion part of the program, but I am stuck on creating a directory and adding the files to the directory.
Here's my code so far.
# Question snippet: render every page of every PDF in Downloads to a .jpg.
import glob, sys, fitz, os, shutil
# Scale factors applied to both axes when rendering a page.
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y) # to get better resolution
all_files = glob.glob('/Users/homefolder/Downloads/*.pdf') # all PDFs in the Downloads folder
print(all_files)
for filename in all_files:
    doc = fitz.open(filename)
    head, tail = os.path.split(doc.name)
    # Everything before the first '.' of the file name becomes the image name prefix.
    save_file_name = tail.split('.')[0]
    for page in doc: # iterate through the pages
        # print(page)
        pix = page.get_pixmap(matrix=mat)
        # render the image
        # There is no path separator after 'files', so each image is written
        # directly into Downloads with a name starting with 'files'.
        filepath_save = '/Users/homefolder/Downloads/files' + save_file_name + str(page.number) + '.jpg'
        pix.save(filepath_save) # save image
# NOTE(review): the original indentation was lost; this line is assumed to run
# once after the loop — confirm.
sample = glob.glob('/Users/homefolder/Downloads/*.jpg')
How would I write the code to create a directory for each pdf file and add those .jpg's to the directory?
You can create a directory and save your processed files to it. I also refactored your code a bit:
import glob
import os

import fitz

# Render at twice the default scale on both axes for sharper images.
zoom_x = 2.0
zoom_y = 2.0
mat = fitz.Matrix(zoom_x, zoom_y)

pdf_files = glob.glob('/Users/homefolder/Downloads/*.pdf')
save_to = '/Users/homefolder/Downloads/pdf_as_img/'

for path in pdf_files:
    doc = fitz.open(path)
    # One output directory per PDF, named after the file without its extension.
    base_name, _ = os.path.splitext(os.path.basename(doc.name))
    directory_to_save = os.path.join(save_to, base_name)
    # exist_ok avoids the check-then-create race and makes re-runs harmless.
    os.makedirs(directory_to_save, exist_ok=True)
    for page in doc:
        pix = page.get_pixmap(matrix=mat)
        # Images are named after the page number: 0.jpg, 1.jpg, ...
        filepath_save = os.path.join(directory_to_save, str(page.number) + '.jpg')
        pix.save(filepath_save)
This script creates a directory for every pdf file and saves pages as jpg to it.

Python: Insert the base64 image into SVG file

The context is that I have created a .svg file using graphviz for a flow chart.
I want to insert another logo picture (it can be a .png or another format) into the .svg file and then distribute it.
I have the base64 code for this logo. However, I don't know how to insert it into the .svg file.
I tried the solution at https://blog.idrsolutions.com/2020/12/how-to-embed-base64-images-in-svg, and it doesn't work.
The steps I followed (using the Google logo as an example):
Step 1: Get the base64 encoding - Successful
import base64
img = r"C:\data\cwd\googlelogo_color_92x30dp.png"
# Read the PNG file as raw bytes.
with open(img, 'rb') as image:
    image_read = image.read()
    # b64encode returns a bytes object, which is why its printed form carries
    # a b'...' prefix.
    image_64_encode = base64.b64encode(image_read)
Step 2: Copy the above base64 string (without the b' prefix) as text and edit the .svg file using a text editor, e.g. Notepad++.
The one below works fine on my PC:
<image xlink:href="c:\data\cwd\googlelogo_color_92x30dp.png" width="160px" height="22px" preserveAspectRatio="xMinYMin meet" x="121.5" y="-480"/>
replace it to:
<image href="data:image/png;charset=utf-8;base64,iVBORw0KGgoAAAANSUhEUgAAALgAAAA8CAYAAADVEnAJAAAOvklEQVR42u1dCZBcRRluyM4Sbg9QuUTFYAhy7Zs3G2Nw5r2ZTWKMWBCXQ5QzIncEFIqjGGtnZpdwaEUOIYcFlBwVRBA5wh7hUIIQCFgkJCAWBUWSnZ3N9d7MXgk7/p/sZje72/3umR2rv6quDOyb1zXd3/v77+///37MD0STxapYY6FWT+ev0zP5h7W0sTqWyWdjGaNA/13UU0Yf/ZvT08bbWjr/GH2+Qc8YJyv3FUNMQmK8Qk8XIrGM+Qc9ld8CIjttn33PvC/elD+JSUiMFxAp64igL4GkvrW08TytADVMouIQfnRucazGKg11qe6v62nzryBkIC1l9NO/d0eT2f2YhCR4ia32T2MpMw8iBt20TP79eIN5HJOQBA8a9cuKE8h1uAfEK3EzY+n8TCYhCR4QoI5MJMv9pF1SxlLGTrLyK+nzQvreZfGUeYbWWDhFy5inU7uCFJS7Ymnjdbgidu6Ha/GAMQlJ8ADIXaWlzSdsqiHv6JnCL6Y3bfu8LV8+aX6JHoD59L1/Czadb8cbt3+RSUiCBwHId5YWNmN8AuucTBb3dPsQkfszDxr57vc23pLklgQPDES68yw3gqn8I7OSnQcw+xBb9LTxDO6LIJEktyR4sFJgyuiyUDluYsXiHsxHYBUgCfKXM5LbvsAkJMGDAEhL7sFyC+t9A5OQqESCQ5YTk9tcymC5JSQqjeAgLiQ+AcHXz0kW92ESEpVIcGQDihUTM84kJCqV4JAFBZvKFiYhUakER7SQ8rc38wgeb8zPYhWOjkTtpGxcvSQbDy+m1tauK6vQ8JnaEvytI64ezQJCT3P1pL7mqkt2tExY3NsSatvRGlqF9r/PzROW4G89LdWB9Z+4ZcuBlN58KgXvMhRge4hW5KeQm09y7+0UMZ47bUFu/3ITvKgooQ5N/V5WV69r19WlHbr6JM3LUzQvD3bo4Ztpvuren/XNvZhTaE0Fhe+eGBsrNVyOAcvGI+eAyDRIRTutPa6+SQN8Hr7ruf83WGhnS+icATIX7TQi+5u9raHz8F2fYho1tAI/SkG5XougXYGu/f30jHHwZ98z6xFNHtlQ0OI3wbNR9Ssd8fACGv+c9fyEt+DaDVHlIGYXlDNyNd89MRezCkQ2oUzPxpW1GBQ3LauF34U1YS7R11I1nci6FqR11ya829dW5bp/BMvIOD3gNMktlsp3YsUma38xJ4XiDb8IXkyyPTviylU0znmn8wOi5+Lq2fYseNpcwiV4On9ORVltxvYgl+P6dk39lDc4Dqx5Py2TN+Ke9sUotgeR+3qywp+CqF5ab3Oov6+16kbc0+mKrGWMj8VkFifNYd8VJMGz0Sn7tevhp73OEc317XhQmAj01L7M+7FY4iqJ3DRoC/HDfW26ejfubYfcRMyFIKfP7W67JEetK1KN/UhZDorgm+qO35es9t/9mh/MOeZHRPC1vB/kNS8kiEGmDfGdbAzA2uIHB9RuYhaAtQUhg2i0Klj2rzWYx9LYbMcYjVeCg4g0lo/5PT8QCRgPouWsPlmsHm8ER9kcG4FcXInBpRBbYuUt2p1fDLUEu3G03AxlclZXLidfcI2lu5IIRxkHpI7E4FJY+NVv0UNwMdSS/mfZXmg9y6sn0/+7nPz1NZbuSnMVt38E4UgdWWdhGDYTQW9DTS1yjqK3FA5PpIzptM/6DSkpG0pB8A49cqF476P20ib/PrruZLgxeCA2zjrx4Jwe/iEUFe53NbUHc/l/QXC4VLtZhfop1UTA9fyBU7rIus/DYIk3POqVGGD+UqiuG0tdKS5j1UTQ9QJydpFPPq9YFPdPVvpKuq5X8ICs46krRNpGYYJc2rwX2Z+iB4QegN8FSfCt0RM+l9WUzSIFC3IuE4B87rm8OYLkizmucBcFzVjFhoEUj/MFT3Y
hW6d8l9kEtFYRySE7shHobQudzyMlEbbQ93yV7f6JxHUikkN2ZCNwcjp/CG0MewRjdq13Rc07wbH558+TsjI3bZotLR6GSKCuJLxvMsvvovyTDQNcD94PJinpJ8wh4MYIBnDUBMP14BKyOeS4f7gxXCveEhrVP7kYacFY/dFpThK0bl8JPiQJfsix3FtziZMOtRUI0iP1NAcvCx6U50Z9EVp3UDJhMMdL5FvZAOB3Cfy5F/huiVifFQWHslrNUYPXwocWWO8XikV3/YuCQ91tex21e5Kc8RHvYCV7ZYSjVwRaJbv9JHhOq1UEKtWNTACQnyKZSZJ+N1htNiEPI3A0kuBXCXy3JaxMgFrCmbiHdi17evgyrvXWwqcwlyBLcBZ3IBPqz4dZ28u41rat2nX/O9tCZwl8+l39JxqMYwTW9jbmElrauN9PgtO4Xc0dz5k1h4yltkA4oKjlsva4ssOumgL5sTNRe8yocK5gkDahdrIsBE/lV3Aeuswwv24xZ2PZbZW3YBmI0JSdPF18mHqymEPCbqgkrjX9F9h+tALs5OnidsoL441d33FN8Ixxmq8E18L3c9yT9WwY4IfTnF4KVcsBqfOksNzbrtcezy0XQ4iWa8Ub87NZiYGDOXkBCy1lnjWM4G28HTnzCAyylZ+HZCleTgnzCJ50SMTf1T8edt4pYbMW9rt+wKJN3V/zkeCYpxWisdykqcfCcND/M+0SG6oWEfuKLQnlQGYFHKIpkOVWlCF9V+MXPG/b5YPCV+YQ/FnPBNfUFt6Of/Aa+MocC+65fyJyC8eC7+ofR91x3LgO5gGQh/0kOAwOL+eHiPqibVJjVdXVx3NaRIcb4+ikWPyA8ZIyi90/J0/io+Flc7SRfL3MBH+9PAQXGyYcX+31XBxfCa6pq71FK5V22m81dEYjh3soODZfEZD8g6Gc4WCBCBtP10VK5wiJsJWzm15dAhcFJGzlBGVWl8hFWcBLmPKyd0osKBzqJ8Fhpd0lvCkv0UbzDATz/DkWWXxcxJ9KUXQs0mGj6UJ4N99OVxbxQr4fT526N3MJbHY4m0y0O4dp4Is4JOztX8lc99//D7Y/d5PZEtrVP04U449V/kTmElixfd1k6uEHnGwa6fp72uM1xwVwbIT5tEVlfUPA5L5Q0PerbASQZCNIpZzLXAIBIkGu+PmD16EShx+UqXbdPwJEvPsicjp4HR54QZAuyVwCK2WQMiHPH0deUOesyAEsKGD3jOoOi0hiEx6GYI5oNnYKJuz7Y4RtjxaFf90GehCx5N1384zwEYPXInFKkCS10m2gBxFL3n37l088wl65obHRzUkIyFmh72/1VUWpU08UqCEbc3pEczVX0ahzN4xkwZ9Zp6yaT/l1ChV8RawM4mQh4znOQ8XdoaMhyYo5BHRYUSBhDF/5TUE0c56LaqBLBQ/NqP5xaq+PwR5Y79/6HqpHrr6mvMfL1OzUlYiLYNxs3JMs/zRmFzz5iSdF0eBe4GUzg+oT5JZY9LV1RqrrCD4h1XNF6ZftCTVuv9RNnUkD3ufE7aEEqHMFBO+lByDuYGM5kxSYPiduj9aw/VuiI6kRDGI2gcNUA0q2gh8+X+CifIC0WGYTIDU088HwPOo0P4xGJzo9+fVxm/khHyALDTkMzAYQgIil8z/AKmDjBNtPEWiyLi5W1opIjgy0Yn09t3gaf4OfKAoLw22B+zBWcTHqL0UkRyospdVy+8ff6Jqridw7RIlW6J/j3i21cCtTotRnuDp0j2sw3kERfMMcZR8a30944zswh0dbVW1hVUaketQ9oHrpNTUMsE3EjPkXh7na/6LBXITBQqIWjiRA5BEH4mO5pL83O3wVyoV2i4yH6jCFA3hlpx6ZgqcdDZ9hWRAVsyh46MPgiYqMreow8RCA6D2t1VMoFD8RDZ8pn2U+cr0tCh766P7c/lENj7QK8YprfohXPWpN+RPgmyNanGjYPglKDOatFAUPNM6nWigoPVCp2hORqZAGB0n
dqU89jKz0BXBHxd8PZ/EgOTozhbOjDrRhs+lkaQWyeuTaoErWoNbYSHO9NqiSNag11tFoI4p3ko7nkjUAIXlbGjgMlqZ0QjZ0cP1stwrHmeRzG6UgN1QBaPKuKuq18B1+kxupmgywV3R8h+/kbq1KOvGhYRzGM8GhfJA1fsLPOcJGFRbe4+lIXV+lH/nnYAluLOdsKG2TPKer1wwFadw3+O65ePgi5gAgORHyGk6QhtNEvnvoIhcq2GxyObZ5Wz2N570TXLxvGsoE9dhQtaUrP/Y1EQoJWD4XMayDv+6Xvr4pVlOLSh8PVvvVrBY5gblE34qq2qFKH1ftVQrouO4/3tT9DRSGuKh33UZRzDm+H/wjDqjlPLiOrw3lf/sMRNEGihJyLi1FD97ihjexuX3PDyBURhLq6TQQrzhIwXyRdvo/GlIr3APKCEUkT9/RHHrFAbFfJCnQl/5hLKBWoSTRptV+kIh92EA++CVjX2eu9PvoNqS74uxBUkE2OTlij75zJsapJId3Qs8m1eTXyFdBUTB29FBLoM/iXDxo5tipg9CoIcQyWso3Gm/UTjpyoDj5LmQagvRo9PkZ7NpRTIydOgsI/W0Tj0SInaS+u5BpCNKjUT75M8gtQTFxf+vehwX5ahpYZZQnwrKTOvYaXseOyh2oKNFb87uVeUEF47mQowj+yGlvjNXcRCUR0STXJY1Tr2gz+g65iR+hlhPJc6juoTn81aa68LeZhIQX8I6QwAPBJCTGA6BzM5eAdZfvaJIYd0BwjqLKZyPzEq9gd5cbZBxErsiOsQguX68uURagSATheFTyDN84ujnXBke5ceo8+1D0wiQkSgmUG4LMHNXjP2TNv+xAAj6edyaKljb/xiQkyvEKGlJG3hFo2+/h/BR7D0q+XVCXO4dJSJQD5HfHrGIOJOHeEc0Yk6GPD88apXjGVBzwJA7xG6vkO1Ilyv2mvFvtvqaE/l1DhH8f7oidIFC8saAyCYlyAtaYk97ssZnzmYRESSE+sOdhHwl+M5OQGG+vZYfVJVmvy0Py2xa8SpBJSJQb4vMFzaXIBXJAbAMvh0Wwh0lIVAJAVgr4zBtwXdbgRVWovUTgBrIgop44Ag7pynW39e/LJCQkKgP/BTwUobIIDirVAAAAAElFTkSuQmCC"/>
I have checked that the output of step 1 starts with iVBORw0KGgoAAAAN and ends with VAAAAAElFTkSuQmCC, to make sure I have copied the full string.
Step 3: Save the .svg file and open it in Google Chrome, but no logo shows up.
Can you please advise what you see as incorrect?
Thanks
I think that:
image_64_encode = base64.b64encode(image_read)
Should perhaps be:
image_64_encode = base64.b64encode(image_read).decode('ascii')
Here's how I accomplished inserting a PNG into a SVG (using Python to do all the work).
Note: I created an SVG with a placeholder PNG, then edited the SVG file and replaced the existing xlink:href="..." data with xlink:href="[PNGREPLACE]". If you open it with Inkscape, the image will display as broken.
import base64
import io
def replace_png_inside_svg(svg_location, png_location):
    """Embed a PNG into an SVG by filling its ``[PNGREPLACE]`` placeholder.

    The PNG is base64-encoded into a ``data:image/png`` URI, which is then
    substituted for every ``[PNGREPLACE]`` marker in the SVG file. The SVG is
    updated by ``changeSVGtextPlaceholders``.

    Args:
        svg_location: Path of the SVG file containing the placeholder.
        png_location: Path of the PNG file to embed.
    """
    prepend_str = "data:image/png;base64,"
    # Read through a context manager so the PNG handle is closed promptly.
    with open(png_location, "rb") as png_file:
        encoded_str = base64.b64encode(png_file.read()).decode('ascii')
    data_tuples = [
        ("[PNGREPLACE]", prepend_str + encoded_str),
    ]
    # Make a copy of the "svg_location" file beforehand to preserve it as a
    # template if needed; the call below updates that file.
    changeSVGtextPlaceholders(svg_location, data_tuples)
def changeSVGtextPlaceholders(svg_output_file, data_tuple, debug_output=False):
    """Replace placeholder strings in a text (SVG) file, rewriting it in place.

    The file is read as UTF-8, every line has each placeholder substituted in
    the order given, and the result is written back to the same path.

    Args:
        svg_output_file: Path of the file to read and overwrite.
        data_tuple: Iterable of ``(placeholder, replacement)`` string pairs.
        debug_output: When true, print each replacement that is performed.
    """
    character_encoding = "utf-8"
    # Materialize once: the pairs are re-used for every line of the file.
    replacements = list(data_tuple)

    with open(svg_output_file, 'r', encoding=character_encoding) as template_svg_file:
        template_svg_file_lines = template_svg_file.readlines()

    new_lines = []
    for line in template_svg_file_lines:
        for check_str, replace_str in replacements:
            if check_str in line:
                line = line.replace(check_str, replace_str)
                if debug_output:
                    print("\tReplacing [" + check_str + "] with [" + replace_str + "]")
        new_lines.append(line)

    # Write only after the whole file has been read and closed.
    with open(svg_output_file, 'w', encoding=character_encoding) as new_SVG_file:
        new_SVG_file.writelines(new_lines)
# Example call: embed the Google logo PNG into my_cool_vector.svg, which is
# expected to contain the [PNGREPLACE] placeholder.
replace_png_inside_svg(r"c:\data\cwd\my_cool_vector.svg", r"c:\data\cwd\googlelogo_color_92x30dp.png")

ipywidgets - widgets.FileUpload, updated CSV file read the CSV file

I am using JupyterHub, and the .ipynb file is hosted on a server. My use case is to upload a CSV file from my local drive and read it for further DataFrame tasks.
# Upload widget configured for a single file.
uploader = widgets.FileUpload(
    accept='*.csv', # Accepted file extension e.g. '.txt', '.pdf', 'image/*', 'image/*,.pdf'
    multiple=False # True to accept multiple files upload else False
)
display(uploader)
# Unpack the single entry of uploader.value into input_file.
[input_file] = uploader.value
print(input_file)
# input_file is handed to read_csv unchanged.
pd.read_csv(input_file)
print(input_file) prints Test.csv, which is the CSV file name.
I am able to print input_file, but `pd.read_csv(input_file)` throws the error below:
FileNotFoundError: [Errno 2] No such file or directory: 'Test.csv'
I'm not sure where the CSV is uploaded or how I can read its data. Please help.
I don't have your exact ipywidgets version, but can you try this:
# uploader.value is a dict keyed by file name in ipywidgets 7.x and a tuple of
# per-file entries in ipywidgets 8.x; take the first uploaded file either way.
uploaded = uploader.value
if isinstance(uploaded, dict):
    input_file = list(uploaded.values())[0]
else:
    input_file = uploaded[0]
# 'content' holds the uploaded file's raw data; bytes() also covers the
# memoryview that 8.x provides, which has no .decode().
content = bytes(input_file['content'])
# Wrap the decoded text in a file-like object so pandas can parse it.
content = io.StringIO(content.decode('utf-8'))
df = pd.read_csv(content)

Download a file from the internet in python?

I am having a problem naming the file I have downloaded and assigning it a file type (see line 4 of the code).
I want to save the file in any location I specifically assign to it, not only in the same directory as the program, e.g. in Downloads, Documents, or any other directory.
import requests
downloadUrl = input('Enter URL: ')
file_url = requests.get(downloadUrl)  # result of requests.get, not a file path
dir_path = input("Enter path and name of the file: ")  # destination path typed by the user
f = open(dir_path,'a+')  # opened for appending and reading; nothing is written to it here
Any inputs?
You are opening a requests object not a file path
file_url = requests.get(downloadUrl)
with open(file_url, 'w+') as f:
You would need to open a filepath on disk somewhere and write to that
See Using the request Module on https://stackabuse.com/download-files-with-python/
EDIT: From comments on post
In [584]: link = "https://www.annualreports.com/HostedData/AnnualReportArchive/3/NASDAQ_QFIN_2018.pdf"
In [585]: import requests
In [586]: file_url = requests.get(link)
In [587]: with open('/tmp/NASDAQ_QFIN_2018.pdf', 'wb') as f:
...: f.write(file_url.content)
...:
In [588]: ls -al /tmp/NASDAQ_QFIN_2018.pdf
-rw-r--r-- 1 ME wheel 1796199 9 Apr 15:21 /tmp/NASDAQ_QFIN_2018.pdf

Failed to Open Display error while running python subprocess and Libreoffice 6.2 in CentOS6

I need to run LibreOffice with --convert-to to convert an Excel file to an HTML page.
The current code works well on a Windows machine. But on AWS (CentOS 6), it produces the error "Failed to open display".
Here is my code
import subprocess

cwdir = '/opt/libreoffice6.2/program'
excel_path = '/home/ec2-user/PythonCode/testing/Book1.xlsx'
dest_path = '/home/ec2-user/PythonCode/testing'
html_command = ["soffice", "--headless", "--convert-to", "html", "--outdir",
                dest_path, excel_path]
# NOTE: kept as posted, because this call is what the question is about.
# With shell=True on POSIX only the first list item is used as the command
# string; the remaining items become arguments to the shell itself, so
# soffice is presumably started without --headless.
subprocess.run(html_command, shell=True, cwd=cwdir)
I was able to convert the Excel file to an HTML file with some minor changes. Passing the arguments as a list did not work, so I had to provide the soffice --convert-to command as a single string, and the folder containing the input Excel file must be different from the destination folder where the HTML file is stored.
import shlex
import subprocess

cwdir = '/opt/libreoffice6.2/program'
excel_path = '/home/ec2-user/PythonCode/testing/Book1.xlsx'
# The output folder is different from the folder holding the input file.
dest_path = '/home/ec2-user/PythonCode/testing/output'

# Build a single command string for the shell. Each argument is quoted so
# paths containing spaces or shell metacharacters are passed through intact.
html_command = " ".join(
    shlex.quote(arg)
    for arg in [
        "soffice", "--headless", "--convert-to", "html",
        "--outdir", dest_path, excel_path,
    ]
)
subprocess.run(html_command, shell=True, cwd=cwdir)

Resources