I need to download a file from a URL and use a PUT request to upload it somewhere else.
The download is done with:
r = requests.get(image_url, auth=HTTPBasicAuth(user, password))
header_content_type = r.headers.get('content-type')
fileType = header_content_type.split('/')[-1]
content_type = header_content_type.split(';')[-1]
file_extension = fileType.split(';', 1)[0]
file_name = file_id + '.' + file_extension
open('downloads/' + file_name, 'wb').write(r.content)
which works fine and stores the file locally in the downloads folder. I can open the image with any image viewer and it works fine.
The body of the PUT request needs to look like:
{ "data":"gsddfgdsfg...(base64) ", "filename":"example2.txt", "contentType":"plain/text" }
I have tried to do it like the following:
def build_step_attachment_json(path, filename, contentype):
    with open(path + filename) as f:
        encoded = base64.b64encode(f.read())
    return '{ "data":"' + encoded + '", "filename":"' + filename + '", "contentType":"' + contentype + '" }'
but it fails with:
"UnicodeDecodeError: 'charmap' codec can't decode byte 0x9d in position 44: character maps to <undefined>"
I am in the process of writing a Python script that downloads a copy of quarantined emails from our gateway. The emails are in .eml format (text), so I was thinking this would be easy, but the resulting file does not open properly in Outlook due to the newlines added.
Here is the download/write function:
def download_message(api_key, endpoint, id):
    endpoint = endpoint + "email/" + id
    headers = {
        'x-fireeye-api-key': api_key,
        'content-type': 'application/json',
        'Accept': 'application/json'}
    data = {}
    r = requests.get(endpoint, headers=headers, data=json.dumps(data))
    if "not found in quarantine." in r.text:
        print("Not found in Quarantine")
    else:
        filename = base_directory + id + ".eml"
        f = open(filename, "w")
        f.write(r.text)
        f.close()
        print("Written : " + id + ".eml" + " to disk")
Opened in a text editor, the output file has a blank line inserted after every line of the message; opened in Outlook, the message does not display properly.
If I manually remove all those blank lines (regex: ^\n) and save the file, it works as expected.
I have tried quite a few ways of removing those blank lines, including strip, rstrip, and re.sub, and nothing seems to have worked.
If it helps, what I was trying to do was create a new variable to hold the "modified" text and then pass that to the write function.
It would have looked something like this (sorry, I have tried loads of variations, but I think you will get the point):
filedata = r.text.strip("\n") or filedata = re.sub('^\n', "", r.text)
...
f.write(filedata)
Can anyone help?
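For reference, those blank lines are the classic symptom of newline translation on Windows: the .eml body already uses \r\n line endings, and writing r.text through a text-mode file turns each \n back into \r\n, producing \r\r\n, which editors and Outlook render as an extra blank line. A sketch of the write step that sidesteps the translation by writing the raw bytes (same names as the function above):

    else:
        filename = base_directory + id + ".eml"
        # r.content is the undecoded response body; "wb" bypasses newline
        # translation, so the original \r\n pairs are written through unchanged
        with open(filename, "wb") as f:
            f.write(r.content)
        print("Written : " + id + ".eml to disk")

Alternatively, open(filename, "w", newline="") keeps text mode but disables the translation.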
So I have a specific need to download and extract a CAB file, but each CAB file is huge (>200 MB), and I want to selectively download files from the CAB since the rest of the data is useless.
What I have done so far:
Request 1% of the file from the server, get the headers, and parse them.
Get the file list and their offsets according to this CAB link.
Send a GET request to the server with the Range header set to the file offset and offset+size.
I am able to get the response, but it is in a way "unreadable" because it is compressed (LZX:21, according to 7-Zip).
I am unable to decompress it using zlib; it throws "invalid header".
Also, I could not quite understand or trace the CFFOLDER or CFDATA structures as shown in the example, because the example is uncompressed.
import os
import struct
import requests

totalByteArray = b''
eofiles = 0

def GetCabMetaData(stream):
    global eofiles
    cabMetaData = {}
    try:
        cabMetaData["CabFormat"] = stream[0:4].decode('ANSI')
        cabMetaData["CabSize"] = struct.unpack("<L", stream[8:12])[0]
        cabMetaData["FilesOffset"] = struct.unpack("<L", stream[16:20])[0]
        cabMetaData["NoOfFolders"] = struct.unpack("<H", stream[26:28])[0]
        cabMetaData["NoOfFiles"] = struct.unpack("<H", stream[28:30])[0]
        # skip 30,32,34,35
        cabMetaData["Files"] = {}
        cabMetaData["Folders"] = {}
        baseOffset = cabMetaData["FilesOffset"]
        internalOffset = 0
        for i in range(0, cabMetaData["NoOfFiles"]):
            fileDetails = {}
            fileDetails["Size"] = struct.unpack("<L", stream[baseOffset+internalOffset:][:4])[0]
            fileDetails["UnpackedStartOffset"] = struct.unpack("<L", stream[baseOffset+internalOffset+4:][:4])[0]
            fileDetails["FolderIndex"] = struct.unpack("<H", stream[baseOffset+internalOffset+8:][:2])[0]
            fileDetails["Date"] = struct.unpack("<H", stream[baseOffset+internalOffset+10:][:2])[0]
            fileDetails["Time"] = struct.unpack("<H", stream[baseOffset+internalOffset+12:][:2])[0]
            fileDetails["Attrib"] = struct.unpack("<H", stream[baseOffset+internalOffset+14:][:2])[0]
            fileName = ''
            for j in range(0, len(stream)):
                if chr(stream[baseOffset+internalOffset+16+j]) != '\x00':
                    fileName += chr(stream[baseOffset+internalOffset+16+j])
                else:
                    break
            internalOffset += 16 + j + 1
            cabMetaData["Files"][fileName] = fileDetails.copy()
        eofiles = baseOffset + internalOffset
    except Exception as e:
        print(e)
    print(cabMetaData["CabSize"])
    return cabMetaData

def GetFileSize(url):
    resp = requests.head(url)
    return int(resp.headers["Content-Length"])

def GetCABHeader(url):
    global totalByteArray
    size = GetFileSize(url)
    newSize = "bytes=0-" + str(int(0.01 * size))
    totalByteArray = b''
    cabHeader = requests.get(url, headers={"Range": newSize}, stream=True)
    for chunk in cabHeader.iter_content(chunk_size=1024):
        totalByteArray += chunk

def DownloadInfFile(baseUrl, InfFileData, InfFileName):
    global totalByteArray, eofiles
    if not os.path.exists("infs"):
        os.mkdir("infs")
    baseCabName = baseUrl[baseUrl.rfind("/"):]
    baseCabName = baseCabName.replace(".", "_")
    if not os.path.exists("infs\\" + baseCabName):
        os.mkdir("infs\\" + baseCabName)
    fileBytes = b''
    newRange = "bytes=" + str(eofiles + InfFileData["UnpackedStartOffset"]) + "-" + str(eofiles + InfFileData["UnpackedStartOffset"] + InfFileData["Size"])
    data = requests.get(baseUrl, headers={"Range": newRange}, stream=True)
    with open("infs\\" + baseCabName + "\\" + InfFileName, "wb") as f:
        for chunk in data.iter_content(chunk_size=1024):
            fileBytes += chunk
        f.write(fileBytes)
        f.flush()
    print("Saved File " + InfFileName)

def main(url):
    GetCABHeader(url)
    cabMetaData = GetCabMetaData(totalByteArray)
    for fileName, data in cabMetaData["Files"].items():
        if fileName.endswith(".txt"):
            DownloadInfFile(url, data, fileName)

main("http://path-to-some-cabinet.cab")
All the file details are correct; I have verified them.
Any guidance will be appreciated. Am I doing it wrong? Is there another way, perhaps?
P.S.: I have already looked into this post.
First, the data in the CAB is raw deflate, not zlib-wrapped deflate. So you need to ask zlib's inflate() to decode raw deflate by passing a negative windowBits value on initialization.
Second, the CAB format does not use standard deflate exactly, in that the 32K sliding-window dictionary carries over from one block to the next. You need to call inflateSetDictionary() at the start of each block with the last 32K decompressed from the previous block.
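A rough sketch of what that looks like with Python's zlib, assuming the folder is MSZIP-compressed and that the CFDATA payloads have already been sliced out of the stream (note that LZX, which 7-Zip reports for this particular cabinet, is a different codec that zlib cannot decode at all). wbits=-15 selects raw deflate, and zdict seeds the window with the tail of the previously decompressed data:

import zlib

def decompress_mszip_folder(cfdata_payloads):
    # Each MSZIP CFDATA payload starts with the 2-byte signature b"CK",
    # followed by a raw deflate stream whose 32K window continues from
    # the previous block's decompressed output.
    history = b''
    out = b''
    for payload in cfdata_payloads:
        if payload[:2] != b'CK':
            raise ValueError('missing MSZIP "CK" block signature')
        if history:
            d = zlib.decompressobj(wbits=-15, zdict=history)  # raw deflate + carried dictionary
        else:
            d = zlib.decompressobj(wbits=-15)                 # first block: empty window
        chunk = d.decompress(payload[2:]) + d.flush()
        out += chunk
        history = (history + chunk)[-32768:]  # keep the last 32 KiB for the next block
    return out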
I am trying to upload a blob (PDF) file from my laptop to a container in an Azure storage account. I found it to be working, but with one glitch.
I am calculating the file size using:
f_info = os.stat(file_path)
file_size = f_info.st_size  # returns 19337
Then I insert this value into the canonicalized header string below:
ch = "PUT\n\n\n"+str(file_size)+"\n\napplication/pdf\n\n\n\n\n\n\nx-ms-blob-type:BlockBlob" + "\nx-ms-date:" + date + "\nx-ms-version:" + version + "\n"
and send the PUT request to the Put Blob API; however, it returns an error saying, "Authentication failed because the server used below string to calculate the signature":
PUT\n\n\n19497\n\napplication/pdf\n\n\n\n\n\n\nx-ms-blob-type:BlockBlob\nx-ms-date:[date]\nx-ms-version:[API version]
Looking at this string, it is obvious that authentication failed because the file size Azure calculated is a different value! I don't understand how it is calculating this file size.
FYI: if I replace 19337 with 19497 in the canonicalized string and re-run, it works!
Any suggestions on where I am making a mistake?
Below is the code:
import base64
import datetime
import hashlib
import hmac
import os
import requests

storage_AccountName = '<storage account name>'
storage_ContainerName = "<container_name>"
storageKey = '<key>'
fd = "C:\\<path>\\<to>\\<file_to_upload>.pdf"
URI = 'https://' + storage_AccountName + '.blob.core.windows.net/<storage_ContainerName>/<blob_file_name.pdf>'
version = '2017-07-29'
date = datetime.datetime.utcnow().strftime("%a, %d %b %Y %H:%M:%S GMT")
if os.path.isfile(fd):
    file_info = os.stat(fd)
    file_size = file_info.st_size
    ch = "PUT\n\n\n" + str(file_size) + "\n\napplication/pdf\n\n\n\n\n\n\nx-ms-blob-type:BlockBlob" + "\nx-ms-date:" + date + "\nx-ms-version:" + version + "\n"
    cr = "/<storage_AccountName>/<storage_Containername>/<blob_file_name.pdf>"
    canonicalizedString = ch + cr
    storage_account_key = base64.b64decode(storageKey)
    byte_canonicalizedString = canonicalizedString.encode('utf-8')
    signature = base64.b64encode(hmac.new(key=storage_account_key, msg=byte_canonicalizedString, digestmod=hashlib.sha256).digest())
    header = {
        'x-ms-blob-type': "BlockBlob",
        'x-ms-date': date,
        'x-ms-version': version,
        'Authorization': 'SharedKey ' + storage_AccountName + ':' + signature.decode('utf-8'),
        #'Content-Length': str(19497), # works
        'Content-Length': str(file_size), # doesn't work
        'Content-Type': "application/pdf"}
    files = {'file': open(fd, 'rb')}
    result = requests.put(url=URI, headers=header, files=files)
    print(result.content)
As mentioned in the comments, the reason you're getting the content-length mismatch is that passing files= does not upload the bare file: requests builds a multipart/form-data body, wrapping boundary lines and part headers around the file contents, and that framing is what pushes the body length past the raw file size.
Please change the following lines of code:
files = {'file': open(fd, 'rb')}
result = requests.put(url=URI, headers=header, files=files)
to something like:
with open(fd, 'rb') as stream:
    result = requests.put(url=URI, headers=header, data=stream)
And now you're only uploading the file contents.
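A quick way to see the difference, for illustration (the URL and payload here are placeholders, not the asker's values): prepare the same request both ways and compare the body lengths; the multipart version is larger by exactly the framing overhead.

import requests

payload = b'x' * 19337  # stand-in for the PDF bytes
multipart = requests.Request('PUT', 'https://example.invalid/blob', files={'file': payload}).prepare()
plain = requests.Request('PUT', 'https://example.invalid/blob', data=payload).prepare()
print(len(multipart.body), len(plain.body))  # multipart body exceeds 19337; plain body is exactly 19337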
I scraped a website containing Hebrew characters and saved the data as a .txt file.
When I open it in PyCharm, it presents it in Hebrew:
"name":"פראמול אף ושיעול LIFE"}
</script><base href="https://shop.super-pharm.co.il/pharmacy/cold/flu/PARAMOL-AF-AND-SHIUL20%2B30/p/254954"/>
<title>
LIFE - פראמול אף ושיעול | סופר-פארם</title>
but when I open it in Notepad it presents it as Chinese:
籘̉㰀瑨汭挠慬獳∽樠獣瑳慲獮瑩潩獮•楤㵲爢汴•慬杮∽敨㸢格慥㹤㰠慢敳栠敲㵦栢瑴獰⼺猯潨畳数桰牡潣椮⽬慮畴敲猯数楣污昭牯畭慬⽳楤瑥䰯䙉ⵅ佈䑏䅉匭䥌䵍剅⽓⽰〵〶㘷⼢ਾ琼瑩敬ਾ
When I open it with the open() command, it presents the info as gibberish, even when I use the encode() command.
What's the problem?
file_name = str(x) + '_' + save_file_name + '_superpharm'
file_out = open(save_file_name + '/' + file_name, 'wb')
pickle.dump(strsoup, file_out)
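For reference, CJK-looking gibberish is the classic sign of UTF-8 bytes being decoded as UTF-16: Notepad guesses the encoding, and pairs of Hebrew UTF-8 bytes happen to land in the CJK range when read as UTF-16 code units. A small demonstration of the effect, plus a sketch that writes the text with an explicit encoding instead of pickling it (save_file_name, file_name, and strsoup are assumed from the snippet above; pickle also adds binary framing of its own, which is part of the gibberish):

hebrew = "פראמול אף ושיעול"
raw = hebrew.encode("utf-8")                      # the bytes that end up in the file
print(raw.decode("utf-16-le", errors="replace"))  # CJK-looking gibberish, like Notepad shows

# writing a readable .txt: declare the encoding and skip pickle entirely
with open(save_file_name + '/' + file_name + '.txt', 'w', encoding='utf-8') as file_out:
    file_out.write(strsoup)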
I use a Python script to scrape for "Show Notes" and an mp3. When I encounter a page that has no show notes, the show was a "Best Of", so I want to skip downloading the notes and the mp3. I am not sure where the best place to insert the test would be. The snippet is as follows:
for show_html in showpage_htmls:
    try:
        p_html = s.get(show_html)
        p_soup = BeautifulSoup(p_html.content, 'html.parser')
        # set title for SHOW NOTES
        title = ''
        title = p_soup.title.contents[0]
        # get SHOW NOTES chunk and remove unwanted characters (original mp3notes not changed)
        mp3notes = ''
        mp3notes = p_soup.find('div', {'class': 'module-text'}).find('div')
        mp3notes = str(title) + str('\n') + str(mp3notes).replace('<div>','').replace('<h2>','').replace('</h2>','\n').replace('<p>','').replace('<br/>\n','\n').replace('<br/>','\n').replace('</p>','').replace('</div>','').replace('\u2032','')
        # FIXME need to skip d/l if no notes
        # set basename, mp3named and mp3showtxt
        mp3basename = '{0}{1}{2}'.format(show_html.split('/')[3], show_html.split('/')[4], show_html.split('/')[5])
        if (os.name == 'nt'):
            mp3showtxt = mp3dir + '\\' + mp3basename + '.txt'
            mp3named = mp3dir + '\\' + mp3basename + '.mp3'
        else:
            mp3showtxt = mp3dir + '/' + mp3basename + '.txt'
            mp3named = mp3dir + '/' + mp3basename + '.mp3'
        # save show notes to local
        with open(mp3showtxt, 'w') as f:
            try:
                f.write(mp3notes)
                print("Show notes " + mp3basename + " saved.")
            except UnicodeEncodeError:
                print("A charmap encoding ERROR occurred.")
                print("Show notes for " + mp3basename + ".mp3 FAILED, but continuing")
            finally:
                f.close()
        # FIXME need eyed3 to set mp3 tags since B&T are lazy
        # get Full Show mp3 link
        mp3url = p_soup.find('a', href=True, string='Full Show').get('href')
        # get and save mp3
        r = requests.get(mp3url)
        with open(mp3named, 'wb') as f:
            f.write(r.content)
        print("Downloaded " + mp3basename + ".mp3.")
    except AttributeError:
        print(show_html + " did not exist as named.")
I would think an
if not (len(mp3notes) >= 50)
check would work; I am just not sure where to put it, or whether there is a better (more Pythonic) way.
Ideally, if the mp3notes are shorter than expected, neither the notes nor the mp3 for that show_html would be saved, and the script would move on to the next show_html page.
Since I am new to Python, feel free to offer suggestions for making this more Pythonic as well; I am here to learn! Thanks.
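One possible placement, sketched under the question's own assumption that notes shorter than about 50 characters mark a "Best Of" show: test mp3notes right after it is assembled, inside the try block and before any path is built or file opened, then continue to the next page. As a fragment at loop-body indentation:

        # ...right after the mp3notes cleanup chain in the loop above:
        # skip "Best Of" shows before anything is written to disk;
        # the 50-character threshold is the question's own guess, adjust as needed
        if len(mp3notes) < 50:
            print("No show notes on " + show_html + "; skipping.")
            continue  # proceed with the next show_html

Putting the check before the paths are computed means a skipped show leaves no empty .txt or .mp3 behind, which matches the "nothing saved for that show_html" requirement.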