How to decode an encoded zipfile using Python? - base64

I have a base64-encoded zip file. I am able to convert that zip file and then extract its content using the Windows command line. I have been trying to do the same with Python, but without success. Could you please help me?
When I run the following code:
import base64
import codecs
import zlib
import io, zipfile, json, pprint
d = open("data.txt", "rb").read()
#dd = base64.decodestring(d)
#print(dd)
z = zipfile.ZipFile(io.BytesIO(d))
unpack = zlib.decompress(d)
I get the following error:
raise BadZipFile("File is not a zip file") zipfile.BadZipFile: File is
not a zip file
The data.txt file contains the base64 string:

A friend of mine helped me. I thought posting the solution here might help a lot of beginners like me:
def convert(d, name, ex):
    """Decode a base64-encoded zip archive and move its contents into ``images/``.

    The decoded bytes are written to ``output_file.zip`` in the current
    working directory, the archive is extracted into ``extracted_file/``,
    and every top-level entry found there is moved to ``images/<name><ex>``.

    Args:
        d: The base64-encoded archive (``bytes`` or ASCII ``str``).
        name: Base name of the destination file, without extension.
        ex: Extension to append to ``name`` (e.g. ``".png"``).

    Raises:
        binascii.Error: If ``d`` is not valid base64.
        zipfile.BadZipFile: If the decoded bytes are not a zip archive.

    Note:
        Every extracted entry is moved to the same destination path, so
        this presumably expects archives holding a single file.
    """
    # Local imports: the snippet uses these modules but never imports them.
    import os
    import shutil

    archive_path = "output_file.zip"
    extract_dir = "extracted_file"

    with open(archive_path, "wb") as result:
        result.write(base64.b64decode(d))

    # The context manager closes the archive even if extraction fails.
    with zipfile.ZipFile(archive_path, "r") as zip_ref:
        zip_ref.extractall(extract_dir)

    # shutil.move fails if the destination directory does not exist.
    os.makedirs("images", exist_ok=True)
    destination = os.path.join("images", name + ex)
    for filename in os.listdir(extract_dir):
        shutil.move(os.path.join(extract_dir, filename), destination)

Related

PyPDF2 append_function can't find file

I want to use the PyPDF2 module to merge PDFs.
The following code works fine:
import PyPDF2
import sys
import os
input_path = r'\Users\XXXXX\OneDrive\Desktop\PDF_File_Input'
merger = PyPDF2.PdfFileMerger()
for file in os.listdir(input_path):
if file.endswith(".pdf"):
print(file)
As soon as I implement the append function I'm getting a traceback error from line 10. FileNotFoundError: [Errno 2] No such file or directory: 'abc.pdf'
import PyPDF2
import sys
import os
input_path = r'\Users\XXXXX\OneDrive\Desktop\PDF_File_Input'
merger = PyPDF2.PdfFileMerger()
for file in os.listdir(input_path):
if file.endswith(".pdf"):
merger.append(file)
merger.write("combined_file.pdf")
I don't understand why the file can be found via print but not by the append function.
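If the path is correct, then it most likely has something to do with the way you implement the merger. Note that os.listdir returns bare file names, not full paths, so each name has to be joined with input_path (for example os.path.join(input_path, file)) before it is passed to append; print works because it never opens the file.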
def run_mergepdf(args):
    """Merge the PDFs listed in ``args.input`` into the file ``args.output``.

    Args:
        args: Object exposing ``input`` (an iterable of PDF paths that
            supports ``len``) and ``output`` (the destination path).

    Zero-byte input files are skipped. ``check_required`` and
    ``check_files`` are helpers defined outside this snippet; presumably
    they validate the arguments and file permissions -- verify against
    their definitions.
    """
    check_required(args, ["input", "output"])
    print("Number of input files: {0}".format(len(args.input)))

    # Preliminary checks
    print("Checking read/write status")
    check_files(args.input, action="r")
    check_files([args.output], action="w")

    # Join pdfs
    print("Starting to merge PDFs")
    merger = PdfFileMerger(strict=False)
    try:
        for pdf in args.input:
            # Only append files that actually contain data.
            if os.stat(pdf).st_size != 0:
                merger.append(PdfFileReader(pdf))
        print("Writing merged file: {0}".format(args.output))
        merger.write(args.output)
    finally:
        # Release the file handles held by the merger, even on failure.
        merger.close()
    print("PDFs merged successfully!")

I'm trying to use Python to open zip files, but getting a "BadZipFile: File is not a zip file" Error

I'm trying to open files from this URL: https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/California/
It's telling me it's not a zip file. I've searched other cases, but the situations are so varied it's hard to determine the course of action.
The following is my code:
import pandas as pd
import os
import numpy as np
from zipfile import ZipFile
from urllib.request import urlopen
zipurl = 'https://www2.census.gov/programs-surveys/decennial/2020/data/01-Redistricting_File--PL_94-171/California/'
zipresp = urlopen(zipurl)
tempzip = open("C:/Temp/tempfile.zip", "wb")
tempzip.write(zipresp.read())
tempzip.close()
zf = ZipFile("C:/Temp/tempfile.zip")
zf.extractall(path = save_path)
zf.close()
I get the following error:
BadZipFile: File is not a zip file
Thanks much.
My Python version is 3.7 and my Spyder Version is 3.3.6.

Running "replace contractions" across all text files in same directory and output to multiple text files in another directory

I am a beginner user of Python.
I am writing a Python script using "replace contractions" to replace contractions in all text files in the same directory with the expanded words, and then outputting the replaced files to another directory.
The code looks like the following at present:
import re, string, unicodedata
import nltk
import contractions
import inflect
import os
txt_files = [f for f in os.listdir('./test') if f.endswith('.txt')]
fd = open(txt_files)
with open(txt_files)as fd:
fd.method
fd.close()
def replace_contractions(text):
    """Replace contractions in a string of text.

    Passes ``text`` to ``contractions.fix`` and returns its result.
    """
    return contractions.fix(text)
output_strings = map(replace_contractions, txt_files)
output_content = "".join(sorted(output_strings)) # sort join the output strings without separators
# write to file
with open(folder_path + output_filename, 'wt') as outfile:
outfile.write(output_content)
The error which I have received is:
"Traceback (most recent call last):
File "C:\Users\User\Desktop\Text Preprocessing.py", line 9, in <module>
fd = open(txt_files)
TypeError: invalid file: ['1.txt', '2.txt']"
Can anyone advise me on resolving the error? Thank you!
I have now edited my code to the following:
import re, string, unicodedata
import nltk
import contractions
import inflect
import os
txt_files = [f for f in os.listdir('./test') if f.endswith('.txt')]
import glob
for each_file in glob.glob("arc\.\d+\.txt"):
print(each_file)
def replace_contractions(text):
    """Replace contractions in a string of text.

    Passes ``text`` to ``contractions.fix`` and returns its result.
    """
    return contractions.fix(text)
output_strings = map(replace_contractions, txt_files)
output_content = "".join(sorted(output_strings)) # sort join the output strings without separators
# write to file
folder_path = 'C:\\Users\\User\\Desktop\\test1\\'
output_filename = os.path.join(folder_path, '.txt')
with open(output_filename, 'wt') as outfile:
outfile.write(output_content)
There is no error. But I have 2 output files. The first is a text file with the string "1.txt2.txt" inside the text file and the second file has a filename as an underscore and is without any extension. I am not getting the desired output in the txt files, i.e. to expand the contractions in the text inside the txt files. Can anyone help out?

Create PDF from lxml

I am converting docx files to pdf. Currently I'm converting the docx to txt file, and then writing the txt file to a pdf.
But I would like to convert the docx directly to a pdf from the parsed lxml (maintaining the lxml structure/formatting).
Is there a streamlined way to do this?
Current docx to pdf conversion:
from shutil import copyfile, rmtree
import sys
import os
import zipfile
from lxml import etree
zip_dir = sys.argv[1]
zip_dir_zip_ext = os.path.splitext(zip_dir)[0] + '.zip'
copyfile(zip_dir, zip_dir_zip_ext)
zip_ref = zipfile.ZipFile(zip_dir_zip_ext, 'r')
zip_ref.extractall('./temp')
data = etree.parse('./temp/word/document.xml')
result = [node.text.strip() for node in data.xpath("//w:t", namespaces={'w':'http://schemas.openxmlformats.org/wordprocessingml/2006/main'})]
import codecs
with codecs.open(os.path.splitext(zip_dir)[0]+'_converted_temp.txt', 'w', 'UTF-8') as txt:
joined_result = '\n'.join(result)
txt.write(joined_result)
zip_ref.close()
rmtree('./temp')
os.remove(zip_dir_zip_ext)
Inspiration: How do I write a python script that can read doc/docx files and convert them to txt?

UnicodeDecodeError Python 3.5.1 Email Script

I am attempting to send an email + attachment to an SMS gateway email. However, I am currently getting a UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 60.
I'm not sure how to go about fixing this and would be interested in your advice. Below is my code and the full error.
import smtplib, os
from email.mime.image import MIMEImage
from email.mime.multipart import MIMEMultipart
msg = MIMEMultipart()
msg['Subject'] = 'Cuteness'
msg['From'] = 'sample#outlook.com'
msg['To'] = '111111111#messaging.sprintpcs.com'
msg.preamble = "Would you pet me? I'd Pet me so hard..."
here = os.getcwd()
file = open('cutecat.png')#the png shares directory with actual script
for here in file: #Error appears to be in this section
with open(file, 'rb') as fp:
img = MIMImage(fp.read())
msg.attach(img)
s = smtplib.SMTP('Localhost')
s.send_message(msg)
s.quit()
""" Traceback (most recent call last):
File "C:\Users\Thomas\Desktop\Test_Box\msgr.py", line 16, in <module>
for here in file:
File "C:\Users\Thomas\AppData\Local\Programs\Python\Python35-32\lib\encodings\cp1252.py", line 23, in decode
return codecs.charmap_decode(input,self.errors,decoding_table)[0]
UnicodeDecodeError: 'charmap' codec can't decode byte 0x8d in position 60: character maps to <undefined>"""
You're trying to open the file twice. First you have:
file = open('cutecat.png')
The default mode to open files is to read them in text mode. That is generally not what you want to do with a binary file like a PNG file.
And then you do:
for here in file:
with open(file, 'rb') as fp:
img = MIMImage(fp.read())
msg.attach(img)
You get an exception in the first line because Python is trying to decode the contents of a binary file as text and fails. The chances of this happening are quite high. It is unlikely that a binary file is also a valid text file in your standard encoding.
But even if that would have worked, for every line in the file you try to open the file again? This makes no sense!
Were you just copy/pasting from the examples, especially the third one? You should note that this example is incomplete. The variable pngfiles used in that example (and which should be a sequence of file names) is not defined.
Try this instead:
# Read the PNG in binary mode so no text decoding is attempted.
with open('cutecat.png', 'rb') as fp:
    # MIMEImage is the class imported from email.mime.image.
    img = MIMEImage(fp.read())
msg.attach(img)
Or if you want to include multiple files:
# Names of the image files to attach.
pngfiles = ('cutecat.png', 'kitten.png')
for filename in pngfiles:
    # Read each image in binary mode and attach it as its own MIME part.
    with open(filename, 'rb') as fp:
        # MIMEImage is the class imported from email.mime.image.
        img = MIMEImage(fp.read())
    msg.attach(img)

Resources