Short context/goal:
We get those xml.p7m files from our Italian creditors via mail, and have to process them, so we can pay those.
The goal is to automate this process, so people don't have to manually validate the data.
If we don't get all the data we need to automatically process the invoice, we create a pdf so people can validate the data.
The Problem:
(I can't attach the full xml.pm7 because of all those characters in there that shouldn't be there, so I did it via picture, but the relevant tag can be copy and pasted)
I don't speak Italian, so it's hard for me to find the
information I need to continue with this task
Since we cant parse the .p7m files with an xml parser (not well
formed, obviously), I have to remove all the unneeded stuff, so we can parse the xml part. Which works in (so far) ~ 450 of 532 mails. With "manually replacing error tags" (see below) we reach for those a 100%, but adding exceptions every now and then isn't the correct way to handle this:
The relevant code:
# Some p7m have some things like ♦♥ in it, they need to be removed (and while we're at it, remove the signature so it's well formed)
# Special Characters which translate to --> \x04\xc2\x82\x03\xc3\xa8
# <Nazione>IT</Nazione>♦♥è
# <Descrizione>nr ordine 9♦♥è303067091</Descrizione>
# <NumeroLinea>6<\Numero♦♥èLinea>
# <Quant♦♥èita>0.00</Quantita>
# </Anagraf♦♥èica>
def getXmlTextRemovedSignature(path, filePath, m_iv_id, mailSubject, amountOfAttachments, logger):
txt = ""
parseError = 0
try:
with open(filePath, encoding='latin-1') as f:
try:
txt = f.read()
except Exception as e:
logger.errorLog("Couldn't read file text: {}".format(e))
# no opening tag to find --> no xml --> decode the file, save it, and get the text
if not re.findall('<',txt):
image_64_decode = base64.decodebytes(txt.encode())
image_result = open(path + 'decoded.xml', 'wb') # create a writable image and write the decoding result
image_result.write(image_64_decode)
image_result.close()
txt = open(path + 'decoded.xml', encoding='latin-1').read()
try:
txt = re.sub(r'[^A-Za-z0-9<>\'\+\-\,\?\"\.\=\/\:\r\n\t\!\[\] {1,}]', '', txt)
if re.findall(r'<\?xml', txt):
txt = '<?xml' + re.split(r'<\?xml',txt,1)[1]
else:
tag = re.findall(r'<.{0,5}FatturaElettronica', txt)[0]
txt = tag + re.split(tag, txt,1)[1]
if re.findall(r'<\/.{0,5}FatturaElettronica>', txt):
tag = re.findall(r'<\/.{0,5}FatturaElettronica>', txt)[0]
txt = re.split(tag, txt, 1)[0] + tag
except Exception as e:
logger.errorLog("Error while removing the signature, m_iv_id = {}, mailsubject: {}, error: {}".format(m_iv_id, mailSubject, e))
# Things to replace
# txt = txt.replace('PDatiRiepilogo','DatiRiepilogo')
# txt = txt.replace('DatiRiepiDlogo', 'DatiRiepilogo')
# txt = txt.replace('DatiRiepiloBgo', 'DatiRiepilogo')
# txt = txt.replace('j/DatiRiepilogo', '/DatiRiepilogo')
# txt = txt.replace('AliquotIaIVA', 'AliquotaIVA')
# txt = txt.replace('AliquotaIVA', 'AliquotaIVA')
# txt = txt.replace('AliquoJtaIVA', 'AliquotaIVA')
# txt = txt.replace('AliquotaHIVA', 'AliquotaIVA')
# txt = txt.replace('AliquotaGIVA', 'AliquotaIVA')
# txt = txt.replace('cAliquotaIVA', 'AliquotaIVA')
# txt = txt.replace('UnitaMiEsura', 'UnitaMisura')
# txt = txt.replace('UnitaMisuraE', 'UnitaMisura')
# txt = txt.replace('De.ttaglioLinee', 'DettaglioLinee')
# txt = txt.replace('DettaglioLin-ee', 'DettaglioLinee')
# txt = txt.replace('Dett?aglioLinee', 'DettaglioLinee')
# txt = txt.replace('DePttaglioLinee', 'DettaglioLinee')
# txt = txt.replace('DettaglioLiEnee', 'DettaglioLinee')
# txt = txt.replace('DettaglioLinee+', 'DettaglioLinee')
# txt = txt.replace('Dettaglio0Linee', 'DettaglioLinee')
# txt = txt.replace('ImponibileImportoS', 'ImponibileImporto')
# txt = txt.replace('SNumeroLinea', 'NumeroLinea')
# txt = txt.replace('FatturaElettronic!aBody', 'FatturaElettronicaBody')
# txt = txt.replace('/FatturaEle ttronica', '/FatturaElettronica')
# txt = txt.replace('SpeseAccessorXie', 'SpeseAccessorie')
# txt = txt.replace('Quantit]a', 'Quantita')
# txt = txt.replace('DatiPaga4mento]a', 'DatiPagamento')
# txt = txt.replace('DatiPaga4mento', 'DatiPagamento')
# txt = txt.replace('IXmposta', 'Imposta')
# txt = txt.replace('DataScfadenzaPagamento', 'DataScadenzaPagamento')
# txt = txt.replace('DataScadenzaPagame/nto', 'DataScadenzaPagamento')
# txt = txt.replace('Dat=aScadenzaPagamento', 'DataScadenzaPagamento')
# txt = txt.replace('RiferimLentoTesto', 'RiferimentoTesto')
# txt = txt.replace('CodiceTipoN', 'CodiceTipo')
# txt = txt.replace('D[ettaglioPagamento', 'DettaglioPagamento')
# txt = txt.replace('DatiBeniServizi', 'DatiBeniServizi')
# txt = txt.replace('AttachmeBnt', 'Attachment')
# txt = txt.replace('ModalitaPagamhento', 'ModalitaPagamento')
# txt = txt.replace('ModalXitaPagamento', 'ModalitaPagamento')
try:
ET.fromstring(txt)
except Exception as e:
print(parseError)
parseError = 3
logger.errorLog("Couldn't parse xml, m_iv_id = {}, mailsubject: {}, error: {}".format(m_iv_id, mailSubject, e))
if txt:
amountOfAttachments[0]+=1
return {"xml" : txt, "parseError" : parseError}
except Exception as e:
logger.errorLog("Error while removing the signature, m_iv_id = {}, mailsubject: {}, path: {}, error: {}".format(m_iv_id, mailSubject, path, e))
p7m example:
0� *�H��
��0�10
`�He
<?xml version="1.0" encoding="utf-8"?>
<p:FatturaElettronica xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:ds="http://www.w3.org/2000/09/xmldsig#" versione="FPR12" xmlns:p="http://ivaservizi.agenziaentrate.gov.it/docs/xsd/fatture/v1.2">
<FatturaElettronicaHeader>
<DatiTrasmissione>
<IdTrasmittente>
<IdPaese>IT</IdPaese>
<IdCodice>02002750483</IdCodice>
</IdTrasmittente>
<ProgressivoInvio>000YP</ProgressivoInvio>
<FormatoTrasmissione>FPR12</FormatoTrasmissione>
<CodiceDestinatario>0000000</CodiceDestinatario>
<PECDestinatario>CENSORED</PECDestinatario>
</DatiTrasmissione>
<CedentePrestatore>
<DatiAnagrafici>
<IdFiscaleIVA>
<IdPaese>IT</IdPaese>
<IdCodice>CENSORED</IdCodice>
</IdFiscaleIVA>
<Anagrafica>
<Denominazione>CENSORED</Denominazione>
</Anagrafica>
<RegimeFiscale>RF01��</RegimeFiscale>
</DatiAnagrafici>
<Sede>
<Indirizzo>CENSORED</Indirizzo>
<CAP>70128</CAP>
<Comune>Bari</Comune>
<Provincia>BA</Provincia>
<Nazione>IT</Nazione>
</Sede>
<IscrizioneREA>
<Ufficio>BA</Ufficio>
<NumeroREA>482324</NumeroREA>
<CapitaleSociale>300000.00</CapitaleSociale>
<SocioUnico>SM</SocioUnico>
<StatoLiquidazione>LN</StatoLiquidazione>
</IscrizioneREA>
</CedentePrestatore>
<CessionarioCommittente>
<DatiAnagrafici>
<IdFiscaleIVA>
<IdPaese>IT</IdPaese>
<IdCodice>CENSORED</IdCodice>
</IdFiscaleIVA>
<Anagrafica>
<Denominazione>CENSORED </Denominazione>
</Anagrafica>
</DatiAnagrafici>
<Sede>
<Indirizzo>CENSORED</Indirizzo>
<CAP>39100</CAP>
<Comune>Bolzano</Comune>
<Pro��vincia>BZ</Provincia>
<Nazione>IT</Nazione>
</Sede>
</CessionarioCommittente>
<TerzoIntermediarioOSoggettoEmittente>
<DatiAnagrafici>
<IdFiscaleIVA>
<IdPaese>IT</IdPaese>
<IdCodice>CENSORED</IdCodice>
</IdFiscaleIVA>
<CodiceFiscale>CENSORED</CodiceFiscale>
<Anagrafica>
<Denominazione>CENSORED</Denominazione>
</Anagrafica>
</DatiAnagrafici>
</TerzoIntermediarioOSoggettoEmittente>
<SoggettoEmittente>TZ</SoggettoEmittente>
</FatturaElettronicaHeader>
<FatturaElettronicaBody>
<DatiGenerali>
<DatiGeneraliDocumento>
<TipoDocumento>TD01</TipoDocumento>
<Divisa>EUR</Divisa>
<Data>2021-09-07</Data>
<Numero>288/2021</Numero>
<ImportoTotaleDocumento>101.27</ImportoTotaleDocumento>
<Causale>OR. 9303258741
COD. 43583</Causale>
</DatiGeneraliDocumento>
</DatiGenerali>
�� <DatiBeniServizi>
<DettaglioLinee>
<NumeroLinea>1</NumeroLinea>
<CodiceArticolo>
<CodiceTipo>INTERNO</CodiceTipo>
<CodiceValore>OFF</CodiceValore>
</CodiceArticolo>
<Descrizione>LAVORO D'OFFICINA JEEP RENEGADE GE721ZJ KM 8763 </Descrizione>
<UnitaMisura>NR</UnitaMisura>
<PrezzoUnitario>0.0000</PrezzoUnitario>
<PrezzoTotale>0.0000</PrezzoTotale>
<AliquotaIVA>22.00</AliquotaIVA>
</DettaglioLinee>
<DettaglioLinee>
<NumeroLinea>2</NumeroLinea>
<CodiceArticolo>
<CodiceTipo>INTERNO</CodiceTipo>
<CodiceValore>OFF</CodiceValore>
</CodiceArticolo>
<Descrizione>SOSTITUZIONE FILTRO OLIO COD. 46337528 </Descrizione>
<Quantita>1.00</Quantita>
<UnitaMisura>NR</UnitaMisura>
<PrezzoUnitario>8.4900</PrezzoUnitario>
<PrezzoTotale>8.4900</PrezzoTotale>
<AliquotaIVA>22.00</AliquotaIVA>
�� </DettaglioLinee>
<DettaglioLinee>
<NumeroLinea>3</NumeroLinea>
<CodiceArticolo>
<CodiceTipo>INTERNO</CodiceTipo>
<CodiceValore>OFF</CodiceValore>
</CodiceArticolo>
<Descrizione>OLIO MOTORE 5W-30 4,5 LITRI </Descrizione>
<Quantita>1.00</Quantita>
<UnitaMisura>NR</UnitaMisura>
<PrezzoUnitario>51.5200</PrezzoUnitario>
<PrezzoTotale>51.5200</PrezzoTotale>
<AliquotaIVA>22.00</AliquotaIVA>
</DettaglioLinee>
<DettaglioLinee>
<NumeroLinea>4</NumeroLinea>
<CodiceArticolo>
<CodiceTipo>INTERNO</CodiceTipo>
<CodiceValore>OFF</CodiceValore>
</CodiceArticolo>
<Descrizione>LAVORO DI MESSA IN OPERA </Descrizione>
<Quantita>1.00</Quantita>
<UnitaMisura>NR</UnitaMisura>
<PrezzoUnitario>23.0000</PrezzoUnitario>
<PrezzoTotale>23.0000</PrezzoTotale>
<AliquotaIVA>22.00</Aliquo�JtaIVA>
</DettaglioLinee>
<DatiRiepilogo>
<AliquotaIVA>22.00</AliquotaIVA>
<ImponibileImporto>83.01</ImponibileImporto>
<Imposta>18.26</Imposta>
<EsigibilitaIVA>I</EsigibilitaIVA>
</DatiRiepilogo>
</DatiBeniServizi>
</FatturaElettronicaBody>
</p:FatturaElettronica>
I can't copy and paste the full .p7m text, sadly, there is a lot more stuff after the closing tag like this:
after I used my function, I still get something like this:
<AliquotaIVA>22.00</AliquoJtaIVA>
AliquoJtaIVA
after I replaced the unwanted stuff from the original:
<AliquotaIVA>22.00</Aliquo�JtaIVA>
Can someone help me out in how to parse those xml.p7m with python(or another language, I really don't care at this point)?
I don't see a logic there, and I really don't want to maintain all the special cases for the next year or so (those would be in a database table btw., so we don't have to adjust the code all the time, but for testing purposes it's hard coded)
Edit:
Another Problem would be if those special characters would appear in a tag with numbers
Related
Below is the sample data in input file. I need to process this file and turn it into a csv file. With some help, I was able to convert it to csv file. However not fully converted to csv since I am not able to handle \n, junk line(2nd line) and blank line(4th line). Also, i need help to filter transaction_type i.e., avoid "rewrite" transaction_type
{"transaction_type": "new", "policynum": 4994949}
44uu094u4
{"transaction_type": "renewal", "policynum": 3848848,"reason": "Impressed with \n the Service"}
{"transaction_type": "cancel", "policynum": 49494949, "cancel_table":[{"cancel_cd": "AU"}, {"cancel_cd": "AA"}]}
{"transaction_type": "rewrite", "policynum": 5634549}
Below is the code
import ast
import csv
with open('test_policy', 'r') as in_f, open('test_policy.csv', 'w') as out_f:
data = in_f.readlines()
writer = csv.DictWriter(
out_f,
fieldnames=[
'transaction_type', 'policynum', 'cancel_cd','reason'],lineterminator='\n',
extrasaction='ignore')
writer.writeheader()
for row in data:
dict_row = ast.literal_eval(row)
if 'cancel_table' in dict_row:
cancel_table = dict_row['cancel_table']
cancel_cd= []
for cancel_row in cancel_table:
cancel_cd.append(cancel_row['cancel_cd'])
dict_row['cancel_cd'] = ','.join(cancel_cd)
writer.writerow(dict_row)
Below is my output not considering the junk line,blank line and transaction type "rewrite".
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with
the Service"
cancel,49494949,"AU,AA",
Expected output
transaction_type,policynum,cancel_cd,reason
new,4994949,,
renewal,3848848,,"Impressed with the Service"
cancel,49494949,"AU,AA",
Hmm I try to fix them but I do not know how CSV file work, but my small knoll age will suggest you to run this code before to convert the file.
txt = {"transaction_type": "renewal",
"policynum": 3848848,
"reason": "Impressed with \n the Service"}
newTxt = {}
for i,j in txt.items():
# local var (temporar)
lastX = ""
correctJ = ""
# check if in J is ascii white space "\n" and get it out
if "\n" in f"b'{j}'":
j = j.replace("\n", "")
# for grammar purpose check if
# J have at least one space
if " " in str(j):
# if yes check it closer (one by one)
for x in ([j[y:y+1] for y in range(0, len(j), 1)]):
# if 2 spaces are consecutive pass the last one
if x == " " and lastX == " ":
pass
# if not update correctJ with new values
else:
correctJ += x
# remember what was the last value checked
lastX = x
# at the end make J to be the correctJ (just in case J has not grammar errors)
j = correctJ
# add the corrections to a new dictionary
newTxt[i]=j
# show the resoult
print(f"txt = {txt}\nnewTxt = {newTxt}")
Termina:
txt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with \n the Service'}
newTxt = {'transaction_type': 'renewal', 'policynum': 3848848, 'reason': 'Impressed with the Service'}
Process finished with exit code 0
I want to find specific lines in a file, add a string to end of that line, and then update the file, but the updated file has extra blank lines between the lines.
def Reading_Logging(PacketName, PacketTot, PacketNum):
try:
with open("C:\\Users\\Shakib\\Desktop\\test.txt", "r+") as f:
content = f.read().splitlines()
#print(names_list)
for i, l in enumerate(content):
tnow = datetime.datetime.now()
linesplit = l.split(',')
if linesplit[0] == PacketName and linesplit[1] == PacketTot and linesplit[2] == PacketNum:
content[i] = content[i].replace(content[i], content[i] + ',' + str(tnow))
with open("C:\\Users\\Shakib\\Desktop\\newtest.txt", "w") as f:
f.write('\n'.join(content))
I expect the following output without blank lines, but this is my real output:
ZoYt,97,0,3.394531,2019-07-27 14:40:27.671415,2019-07-27 19:22:48.824541
ZoYt,97,1,3.000977,2019-07-27 14:40:27.701415
ZoYt,97,2,1.879883,2019-07-27 14:40:27.731415
ZoYt,97,3,3.681641,2019-07-27 14:40:27.753415
ZoYt,97,4,1.069336,2019-07-27 14:40:27.760416
ZoYt,97,5,1.094727,2019-07-27 14:40:27.773417
ZoYt,97,6,3.077148,2019-07-27 14:40:27.787417
ZoYt,97,7,1.015625,2019-07-27 14:40:27.798418
ZoYt,97,8,3.765625,2019-07-27 14:40:27.813419
ZoYt,97,9,2.797852,2019-07-27 14:40:27.823419
ZoYt,97,10,3.860352,2019-07-27 14:40:27.837420
ZoYt,97,11,3.179688,2019-07-27 14:40:27.849421
I'm reading a binary file that has a code on STM32. I placed deliberate 2 const strings in the code, that allows me to read SW version and description from a given file.
When you open a binary file with hex editor or even in python3, you can see correct form. But when run text = data.decode('utf-8', errors='ignore'), it removes a zeros from the file! I don't want this, as I keep EOL characters to properly split and extract string that interest me.
(preview of the end of the data variable)
Svc\x00..\Src\adc.c\x00..\Src\can.c\x00defaultTask\x00Task_CANbus_receive\x00Task_LED_Controller\x00Task_LED1_TX\x00Task_LED2_RX\x00Task_PWM_Controller\x00**SW_VER:GN_1.01\x00\x00\x00\x00\x00\x00MODULE_DESC:generic_module\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00**Task_SuperVisor_Controller\x00\x00\x00\x00\x00\x00\x00\x00\x00\x01\x02\x03\x04\x06\x07\x08\t\x00\x00\x00\x00\x01\x02\x03\x04..\Src\tim.c\x005!\x00\x08\x11!\x00\x08\x01\x00\x00\x00\xaa\xaa\xaa\xaa\x01\x01\nd\x00\x02\x04\nd\x00\x00\x00\x00\xa2J\x04'
(preview of text, i.e. what I receive after decode)
r g # IDLE TmrQ Tmr Svc ..\Src\adc.c ..\Src\can.c
defaultTask Task_CANbus_receive Task_LED_Controller Task_LED1_TX
Task_LED2_RX Task_PWM_Controller SW_VER:GN_1.01
MODULE_DESC:generic_module
Task_SuperVisor_Controller ..\Src\tim.c 5! !
d d J
with open(path_to_file, "rb") as binary_file:
# Read the whole file at once
data = binary_file.read()
text = data.decode('utf-8', errors='ignore')
# get index of "SW_VER:" sting in the file
sw_ver_index = text.rfind("SW_VER:")
# SW_VER found
if sw_ver_index is not -1:
# retrive the value, e.g. "SW_VER:WB_2.01" will has to start from position 7 and finish at 14
sw_ver_value = text[sw_ver_index + 7:sw_ver_index + 14]
module.append(tuple(('DESC:', sw_ver_value)))
else:
# SW_VER not found
module.append(tuple(('DESC:', 'N/A')))
# get index of "MODULE_DESC::" sting in the file
module_desc_index = text.rfind("MODULE_DESC:")
# MODULE_DESC found
if module_desc_index is not -1:
module_desc_substring = text[module_desc_index + 12:]
module_desc_value = module_desc_substring.split()
module.append(tuple(('DESC:', module_desc_value[0])))
print(module_desc_value[0])
As you can see my white characters are gone, while they should be present
So I have a specific need to download and extract a cab file but the size of each cab file is huge > 200MB. I wanted to selectively download files from the cab as rest of the data is useless.
Done so much so far :
Request 1% of the file from the server. Get the headers and parse them.
Get the files list, their offsets according to This CAB Link.
Send a GET request to server with the Range header set to the file Offset and the Offset+Size.
I am able to get the response but it is in a way "Unreadable" cause it is compressed (LZX:21 - Acc to 7Zip)
Unable to decompress using zlib. Throws invlid header.
Also I did not quite understand nor could trace the CFFOLDER or CFDATA as shown in the example cause its uncompressed.
totalByteArray =b''
eofiles =0
def GetCabMetaData(stream):
global eofiles
cabMetaData={}
try:
cabMetaData["CabFormat"] = stream[0:4].decode('ANSI')
cabMetaData["CabSize"] = struct.unpack("<L",stream[8:12])[0]
cabMetaData["FilesOffset"] = struct.unpack("<L",stream[16:20])[0]
cabMetaData["NoOfFolders"] = struct.unpack("<H",stream[26:28])[0]
cabMetaData["NoOfFiles"] = struct.unpack("<H",stream[28:30])[0]
# skip 30,32,34,35
cabMetaData["Files"]= {}
cabMetaData["Folders"]= {}
baseOffset = cabMetaData["FilesOffset"]
internalOffset = 0
for i in range(0,cabMetaData["NoOfFiles"]):
fileDetails = {}
fileDetails["Size"] = struct.unpack("<L",stream[baseOffset+internalOffset:][:4])[0]
fileDetails["UnpackedStartOffset"] = struct.unpack("<L",stream[baseOffset+internalOffset+4:][:4])[0]
fileDetails["FolderIndex"] = struct.unpack("<H",stream[baseOffset+internalOffset+8:][:2])[0]
fileDetails["Date"] = struct.unpack("<H",stream[baseOffset+internalOffset+10:][:2])[0]
fileDetails["Time"] = struct.unpack("<H",stream[baseOffset+internalOffset+12:][:2])[0]
fileDetails["Attrib"] = struct.unpack("<H",stream[baseOffset+internalOffset+14:][:2])[0]
fileName =''
for j in range(0,len(stream)):
if(chr(stream[baseOffset+internalOffset+16 +j])!='\x00'):
fileName +=chr(stream[baseOffset+internalOffset+16 +j])
else:
break
internalOffset += 16+j+1
cabMetaData["Files"][fileName] = (fileDetails.copy())
eofiles = baseOffset + internalOffset
except Exception as e:
print(e)
pass
print(cabMetaData["CabSize"])
return cabMetaData
def GetFileSize(url):
resp = requests.head(url)
return int(resp.headers["Content-Length"])
def GetCABHeader(url):
global totalByteArray
size = GetFileSize(url)
newSize ="bytes=0-"+ str(int(0.01*size))
totalByteArray = b''
cabHeader= requests.get(url,headers={"Range":newSize},stream=True)
for chunk in cabHeader.iter_content(chunk_size=1024):
totalByteArray += chunk
def DownloadInfFile(baseUrl,InfFileData,InfFileName):
global totalByteArray,eofiles
if(not os.path.exists("infs")):
os.mkdir("infs")
baseCabName = baseUrl[baseUrl.rfind("/"):]
baseCabName = baseCabName.replace(".","_")
if(not os.path.exists("infs\\" + baseCabName)):
os.mkdir("infs\\"+baseCabName)
fileBytes = b''
newRange = "bytes=" + str(eofiles+InfFileData["UnpackedStartOffset"] ) + "-" + str(eofiles+InfFileData["UnpackedStartOffset"]+InfFileData["Size"] )
data = requests.get(baseUrl,headers={"Range":newRange},stream=True)
with open("infs\\"+baseCabName +"\\" + InfFileName ,"wb") as f:
for chunk in data.iter_content(chunk_size=1024):
fileBytes +=chunk
f.write(fileBytes)
f.flush()
print("Saved File " + InfFileName)
pass
def main(url):
GetCABHeader(url)
cabMetaData = GetCabMetaData(totalByteArray)
for fileName,data in cabMetaData["Files"].items():
if(fileName.endswith(".txt")):
DownloadInfFile(url,data,fileName)
main("http://path-to-some-cabinet.cab")
All the file details are correct. I have verified them.
Any guidance will be appreciated. Am I doing it wrong? Another way perhaps?
P.S : Already Looked into This Post
First, the data in the CAB is raw deflate, not zlib-wrapped deflate. So you need to ask zlib's inflate() to decode raw deflate with a negative windowBits value on initialization.
Second, the CAB format does not exactly use standard deflate, in that the 32K sliding window dictionary carries from one block to the next. You'd need to use inflateSetDictionary() to set the dictionary at the start of each block using the last 32K decompressed from the last block.
So I created this Folder C:\TempFiles to test run the following code snippet
Inside this folder i had two files -> nd1.txt, nd2.txt and a folder C:\TempFiles\Temp2, inside which i had only one file nd3.txt
Now when i execute this code:-
import os,file,storage
database = file.dictionary()
tools = storage.misc()
lui = -1 # last used file index
fileIndex = 1
def sendWord(wrd, findex): # where findex is the file index
global lui
if findex!=lui:
tools.refreshRecentList()
lui = findex
if tools.mustIgnore(wrd)==0 and tools.toRecentList(wrd)==1:
database.addWord(wrd,findex) # else there's no point adding the word to the database, because its either trivial, or has recently been added
def showPostingsList():
print("\nPOSTING's LIST")
database.display()
def parseFile(nfile, findex):
for line in nfile:
pl = line.split()
for word in pl:
sendWord(word.lower(),findex)
def parseDirectory(dirname):
global fileIndex
for root,dirs,files in os.walk(dirname):
for name in dirs:
parseDirectory(os.path.join(root,name))
for filename in files:
nf = open(os.path.join(root,filename),'r')
parseFile(nf,fileIndex)
print(" --> "+ nf.name)
fileIndex+=1
nf.close()
def main():
dirname = input("Enter the base directory :-\n")
print("\nParsing Files...")
parseDirectory(dirname)
print("\nPostings List has Been successfully created.\n",database.entries()," word(s) sent to database")
choice = ""
while choice!='y' and choice!='n':
choice = str(input("View List?\n(Y)es\n(N)o\n -> ")).lower()
if choice!='y' and choice!='n':
print("Invalid Entry. Re-enter\n")
if choice=='y':
showPostingsList()
main()
Now I should Traverse the three files only once each, and i put a print(filename) to test that, but apparently I am traversing the inside folder twice:-
Enter the base directory :-
C:\TempFiles
Parsing Files...
--> C:\TempFiles\Temp2\nd3.txt
--> C:\TempFiles\nd1.txt
--> C:\TempFiles\nd2.txt
--> C:\TempFiles\Temp2\nd3.txt
Postings List has Been successfully created.
34 word(s) sent to database
View List?
(Y)es
(N)o
-> n
Can Anyone tell me how to modify the os.path.walk() as such to avoid the error
Its not that my output is incorrect, but its traversing over one entire folder twice, and that's not very efficient.
Your issue isn't specific to Python 3, it's how os.walk() works - iterating already does the recursion to subfolders, so you can take out your recursive call:
def parseDirectory(dirname):
global fileIndex
for root,dirs,files in os.walk(dirname):
for filename in files:
nf = open(os.path.join(root,filename),'r')
parseFile(nf,fileIndex)
print(" --> "+ nf.name)
fileIndex+=1
nf.close()
By calling parseDirectory() for the dirs, you were starting another, independant walk of your only subfolder.