The following code extracts subtitle data from mkv files. I want the output in one list, but it creates a separate list for each file. How can the code be changed to give one list containing all the tracks rather than one list per file?
#!/usr/bin/env python3
import os
import re
import json
import subprocess
from itertools import chain

mkvmerge = "/usr/bin/mkvmerge"
keep_lang = "eng"

#############################################################################

def extract_subs(mkv_list):
    subtitle_tracks = []
    video_tracks = []
    audio_tracks = []
    extractList = []
    for file in mkv_list:
        ### Command-line arguments for extracting info about the mkv file
        command = [mkvmerge, "-i", "-F", "json", file]
        # Ask mkvmerge for the json info
        process = subprocess.run(command, capture_output=True, text=True, check=True)
        stdout = process.stdout
        ### Process the json response
        json_data = json.loads(stdout)
        tracks = json_data.get('tracks', [])
        ### Find audio and subtitle tracks
        audio = []
        subtitle = []
        track_list = []
        for track in tracks:
            track['properties']['id'] = track['id']
            if track['type'] == 'audio':
                audio.append(track)
            elif track['type'] == 'subtitles':
                subtitle.append(track)
        # Filter out files that don't need processing
        if len(audio) < 2 and len(subtitle) < 2:
            pass
            #print("\nNo extracted subs to process.", file)
        subtitle_keep = list(filter(lambda a: a['properties']['language'] == keep_lang, subtitle))
        for s in subtitle_keep:
            track_list.append(f"Track #{s['id']}: {s['properties'].get('language')} - {s['codec']}: {file}")
        print(track_list)
Output:
['Track #2: eng - SubRip/SRT: /home/mp/torrents/test/Belfast (2021)/Belfast (2021).mkv', 'Track #3: eng - SubRip/SRT: /home/mp/torrents/test/Belfast (2021)/Belfast (2021).mkv']
['Track #2: eng - SubRip/SRT: /home/mp/torrents/test/The Rescue (2021)/The Rescue (2021).mkv']
Desired output:
['Track #2: eng - SubRip/SRT: /home/mp/torrents/test/Belfast (2021)/Belfast (2021).mkv', 'Track #3: eng - SubRip/SRT: /home/mp/torrents/test/Belfast (2021)/Belfast (2021).mkv', 'Track #2: eng - SubRip/SRT: /home/mp/torrents/test/The Rescue (2021)/The Rescue (2021).mkv']
It seems like your variable track_list has the wrong scope.
Try moving its initialization out of the for loop, so it is not reset for every file:
#!/usr/bin/env python3
import os
import re
import json
import subprocess
from itertools import chain

mkvmerge = "/usr/bin/mkvmerge"
keep_lang = "eng"

#############################################################################

def extract_subs(mkv_list):
    subtitle_tracks = []
    video_tracks = []
    audio_tracks = []
    extractList = []
    track_list = []
    for file in mkv_list:
        ### Command-line arguments for extracting info about the mkv file
        command = [mkvmerge, "-i", "-F", "json", file]
        # Ask mkvmerge for the json info
        process = subprocess.run(command, capture_output=True, text=True, check=True)
        stdout = process.stdout
        ### Process the json response
        json_data = json.loads(stdout)
        tracks = json_data.get('tracks', [])
        ### Find audio and subtitle tracks
        audio = []
        subtitle = []
        for track in tracks:
            track['properties']['id'] = track['id']
            if track['type'] == 'audio':
                audio.append(track)
            elif track['type'] == 'subtitles':
                subtitle.append(track)
        # Filter out files that don't need processing
        if len(audio) < 2 and len(subtitle) < 2:
            pass
            #print("\nNo extracted subs to process.", file)
        subtitle_keep = list(filter(lambda a: a['properties']['language'] == keep_lang, subtitle))
        for s in subtitle_keep:
            track_list.append(f"Track #{s['id']}: {s['properties'].get('language')} - {s['codec']}: {file}")
    print(track_list)
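An alternative, if you would rather keep a per-file list inside the loop, is to collect each file's list and flatten them at the end with itertools.chain, which this script already imports. A minimal sketch of just the flattening step, with made-up data standing in for the per-file results:

from itertools import chain

# Hypothetical per-file results, standing in for what the loop collects
per_file_lists = [
    ['Track #2: eng - SubRip/SRT: file1.mkv', 'Track #3: eng - SubRip/SRT: file1.mkv'],
    ['Track #2: eng - SubRip/SRT: file2.mkv'],
]
track_list = list(chain.from_iterable(per_file_lists))
print(track_list)  # one flat list containing all three entries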
Related
I have a csv file with URLs I'd like to extract data from, but my script only ends up appending the last entry. This is the script:
import os
import glob
import time
from urllib.request import urlopen
import pandas as pd
import xml.etree.ElementTree as ET

count = 0
files = glob.glob('./extract/isbnlist/Reihe*_isbn-dnb2.csv', recursive=True)  # searches all files in folder
print(files)
for file in files:
    if count == 0:
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        for row in csvfile['URL']:
            print('row: ' + row)
            with urlopen(str(row)) as response:
                doc = ET.parse(response)
                root = doc.getroot()
            namespaces = {  # Manually extracted from the XML file, but there could be code written to automatically do that.
                "zs": "http://www.loc.gov/zing/srw/",
                "": "http://www.loc.gov/MARC21/slim",
            }
            datafield_nodes_path = "./zs:records/zs:record/zs:recordData/record/datafield"  # XPath
            datafield_attribute_filters = [  # which fields to extract
                {
                    "tag": "100",  # author
                    "ind1": "1",
                    "ind2": " ",
                }]
            #datafield_attribute_filters = []  # Uncomment this line to clear filters (and process each datafield node)
            aut = []
            for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
                if datafield_attribute_filters:
                    skip_node = True
                    for attr_dict in datafield_attribute_filters:
                        for k, v in attr_dict.items():
                            if datafield_node.get(k) != v:
                                break
                        else:
                            skip_node = False
                            break
                    if skip_node:
                        continue
                for subfield_node in datafield_node.iterfind("./subfield[@code='a']", namespaces=namespaces):
                    aut.append(subfield_node.text)  # this gets the author name and title
            print(aut)
    count += 1
and this is the csv file:
URL
0 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783960382850&recordSchema=MARC21-xml
1 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963622106&recordSchema=MARC21-xml
2 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D-&recordSchema=MARC21-xml
3 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783806241280&recordSchema=MARC21-xml
4 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783890296005&recordSchema=MARC21-xml
5 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699111&recordSchema=MARC21-xml
6 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110698930&recordSchema=MARC21-xml
7 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783110699104&recordSchema=MARC21-xml
8 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783963621093&recordSchema=MARC21-xml
9 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9783451716034&recordSchema=MARC21-xml
10 http://services.dnb.de/sru/dnb?version=1.1&operation=searchRetrieve&query=ISBN%3D9788791953514&recordSchema=MARC21-xml
When I execute the script, the output is:
['Schmidt, Horst']
but I need the other results as well. How can I do that?
Any help is appreciated.
EDIT: link to the full csv file on Pastebin; the filename is: Reihe-21A51.csv_extract.csv_isbn-dnb2.csv
As @Tranbi pointed out, I had to move the aut = [] outside of the loop.
It's now

for file in files:
    if count == 0:  # to only go through the first file, instead of all files in the folder
        csvfile = pd.read_csv(file, sep='\t', encoding='utf-8')
        aut = []

instead of

aut = []
for datafield_node in root.iterfind(datafield_nodes_path, namespaces=namespaces):
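The underlying rule, for anyone hitting the same problem: an accumulator list must be initialized once, before the loop whose results it should collect; initializing it inside the loop resets it on every iteration, so only the last iteration's results survive. A minimal, self-contained illustration with made-up data:

# Accumulator initialized once, before the loop: every batch is kept.
results = []
for batch in [[1, 2], [3], [4, 5]]:
    for item in batch:
        results.append(item)
print(results)  # [1, 2, 3, 4, 5]
# Had results = [] been inside the outer loop, only [4, 5] would remain.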
I have a script I wrote in Python 2 to encrypt files using Crypto.Cipher.ARC4.
Now that Python 2 has been EOL for over a year, I've been starting to move everything to Python 3.
Is it possible to decrypt files encrypted with my script using Python 3 (and vice versa)?
Here is my script:
#!/usr/bin/python
import os
from Crypto.Cipher import ARC4

key = "my_redacted_string"

dir = os.path.realpath(__file__)
dir = os.path.dirname(dir)
# https://stackoverflow.com/questions/4934806
files = os.listdir(dir)
os.chdir(dir)

script_name = __file__
script_name = script_name.split("/")[-1]

proceed = [1, "y", "yes", '1']
for f in files:
    if f == script_name:
        pass
    else:
        string = "Process file? : %s > " % f
        answer = raw_input(string)
        if answer in proceed:
            filo = open(f)  # filo == file object, file open
            data = filo.read()
            filo.close()
            e = ARC4.new(key)
            e = e.encrypt(data)
            out_name = "%s.dat" % f
            filo = open(out_name, 'w')
            filo.write(e)
            filo.close()
Here is a script I wrote to decrypt files encrypted with the above script:
#!/usr/bin/python
import os
from Crypto.Cipher import ARC4

key = "my_redacted_string"

dir = os.path.realpath(__file__)
dir = os.path.dirname(dir)
# https://stackoverflow.com/questions/4934806
files = os.listdir(dir)
os.chdir(dir)

script_name = __file__
script_name = script_name.split("/")[-1]

proceed = [1, "y", "yes", '1']
for f in files:
    if f == script_name:
        pass
    else:
        string = "Process file? : %s > " % f
        answer = raw_input(string)
        if answer in proceed:
            filo = open(f)  # filo == file object, file open
            data = filo.read()
            filo.close()
            d = ARC4.new(key)
            d = d.decrypt(data)
            out_name = os.path.splitext(f)[0]
            print out_name
            filo = open(out_name, 'w')
            filo.write(d)
            filo.close()
I try to make everything cross-platform and included #!/usr/bin/python out of habit, but I am using 64-bit Windows 10 on my laptop (I have one Linux box, plus VMs and VPSes running Linux; this script is used client-side, so on Windows).
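For what it's worth: RC4 is a byte-stream cipher, so ciphertext produced under Python 2 should decrypt fine under Python 3, as long as the key is passed as bytes and files are read and written in binary mode (note that the scripts above open files in text mode, which can silently corrupt binary data on Windows). A minimal Python 3 sketch, assuming Crypto.Cipher.ARC4 comes from pycryptodome; the helper name is made up:

#!/usr/bin/env python3
from Crypto.Cipher import ARC4  # assumes the pycryptodome package is installed

key = b"my_redacted_string"  # must be the exact same key bytes the Python 2 script used

def decrypt_file(in_path, out_path):  # hypothetical helper, not from the scripts above
    with open(in_path, "rb") as f:    # binary mode: no newline translation
        data = f.read()
    d = ARC4.new(key)
    with open(out_path, "wb") as f:
        f.write(d.decrypt(data))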
At the moment I am able to create one CSV file with all the content I get at once.
Now I would like to create a list with different names in it.
How can I produce a different CSV file name for every function call? I thought about looping over a list, but I just want a +1 increment at each call. I thought about saving my state somehow and using it in the next function call, but every time the variable gets initialized with 0, so I never get to 1. I think I could do it with function parameters, but I have no idea how to use them. Can someone give me a small tip or example? If there are better ideas (maybe mine is total nonsense), please help.
The comments in the code represent what I have in mind.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from tenable.sc import SecurityCenter as SC
import os.path
import sys
import getpass
import csv

SC_HOST = '...'

def parse_entry(entry):
    split_after_path = ''
    ip = entry.get('ip', None)
    pluginText = entry.get('pluginText', None)
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                split_after_path_in_plugintext = line.split("Path : ", 1)[1]
    # place = ['place1', 'place2', 'place3', 'place4', 'place5']
    # i = 0
    # i = i + 1
    file_exists = os.path.isfile('testfile_path.csv')
    # file_exists = os.path.isfile('testfile_path_' + place[i] + '.csv')
    data = open('testfile_path.csv', 'a')
    # data = open('testfile_path_' + place[i] + '.csv', 'a')
    with data as csvfile:
        header = ['IP Address', 'Path']
        writer = csv.DictWriter(csvfile, lineterminator='\n', quoting=csv.QUOTE_NONNUMERIC, fieldnames=header)
        if not file_exists:
            writer.writeheader()
        writer.writerow({'IP Address': ip, 'Path': split_after_path})
    data.close()

def main():
    sc_user = input('[<<] username: ')
    sc_pass = getpass.getpass('[<<] password: ')
    sc = SC(SC_HOST)
    sc.login(sc_user, sc_pass)
    # Query API for data
    # asset = [12, 13, 14, 25, 29]
    # i = 0
    # assetid = asset[i]
    # vuln = sc.analysis.vulns(('pluginID', '=', '25072'), ('asset', '=', assetid))
    # i = i + 1
    vuln = sc.analysis.vulns(('pluginID', '=', '25072'), ('asset', '=', '11'))
    for entry in vuln:
        parse_entry(entry)
    sc.logout()
    return 0

if __name__ == '__main__':
    sys.exit(main())
The simplest and most obvious solution is to pass the full file path to your parse_entry function, i.e.:

def parse_entry(entry, filepath):
    # ...
    if 'Path : ' in pluginText:
        for line in pluginText.splitlines(0):
            if 'Path : ' in line:
                # ...
    file_exists = os.path.isfile(filepath)
    with open(filepath, 'a') as csvfile:
        # ...

Then in main() use enumerate() to build sequential filenames:

def main():
    # ...
    for i, entry in enumerate(vuln):
        path = "testfile_path_{}.csv".format(i)
        parse_entry(entry, path)
You can use a function attribute to keep track of the number of times the function has been called:

def parse_entry(entry):
    parse_entry.i += 1
    # ...

# outside the function you have to initialize the attribute
parse_entry.i = 0

Or you can look at other ways to initialize the function attribute in this post.
Alternatively, you can use glob to get the current number of files:

from glob import glob
i = len(glob('testfile_path_*.csv'))
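Putting the function-attribute idea together with the filename requirement, a small self-contained sketch (the helper name is made up):

def next_csv_name():
    next_csv_name.i += 1
    return 'testfile_path_{}.csv'.format(next_csv_name.i)

next_csv_name.i = 0  # initialize once, outside the function

print(next_csv_name())  # testfile_path_1.csv
print(next_csv_name())  # testfile_path_2.csv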
Under Linux / bash, how can I obtain a plain-text representation of a directory and its contents? (Note that by "plain-text" here I mean "UTF-8".)
In other words, how could I "pack" or "archive" a directory (with contents, including binary files) as a plain text file, such that I could "unpack" it later and obtain the same directory with its contents?
I was interested in this for a while, and I think I finally managed to cook up a script that works in both Python 2.7 and 3.4; however, I'd still like to know if there is something else that does the same. Here it is as a Gist (with some more comments):
https://gist.github.com/anonymous/1a68bf2c9134fd5312219c8f68713632
Otherwise, I'm posting a slightly abridged version here (below) for reference.
The usage is: to archive/pack into a .json text file:
python archdir2text-json.py -a /tmp > myarchdir.json
... and to unpack from the .json text file into the current (calling) directory:
python archdir2text-json.py -u myarchdir.json
Binary files are handled as base64.
Here is the script:
archdir2text-json.py
#!/usr/bin/env python
import pprint, inspect
import argparse
import os
import stat
import errno
import base64
import codecs

class SmartDescriptionFormatter(argparse.RawDescriptionHelpFormatter):
    def _fill_text(self, text, width, indent):
        if text.startswith('R|'):
            paragraphs = text[2:].splitlines()
            rebroken = [argparse._textwrap.wrap(tpar, width) for tpar in paragraphs]
            rebrokenstr = []
            for tlinearr in rebroken:
                if (len(tlinearr) == 0):
                    rebrokenstr.append("")
                else:
                    for tlinepiece in tlinearr:
                        rebrokenstr.append(tlinepiece)
            return '\n'.join(rebrokenstr)
        return argparse.RawDescriptionHelpFormatter._fill_text(self, text, width, indent)

textchars = bytearray({7,8,9,10,12,13,27} | set(range(0x20, 0x100)) - {0x7f})
is_binary_string = lambda bytes: bool(bytes.translate(None, textchars))

cwd = os.getcwd()
if os.name == 'nt':
    import win32api, win32con

def folder_is_hidden(p):
    if os.name == 'nt':
        attribute = win32api.GetFileAttributes(p)
        return attribute & (win32con.FILE_ATTRIBUTE_HIDDEN | win32con.FILE_ATTRIBUTE_SYSTEM)
    else:
        return os.path.basename(p).startswith('.')  # linux-osx

def path_hierarchy(path):
    hierarchy = {
        'type': 'folder',
        'name': os.path.basename(path),
        'path': path,
    }
    try:
        cleared_contents = [contents
            for contents in os.listdir(path)
            if not (
                os.path.isdir(os.path.join(path, contents))
                and
                folder_is_hidden(os.path.join(path, contents))
            )]
        hierarchy['children'] = [
            path_hierarchy(os.path.join(path, contents))
            for contents in cleared_contents
        ]
    except OSError as e:
        if e.errno == errno.ENOTDIR:
            hierarchy['type'] = 'file'
        else:
            hierarchy['type'] += " " + str(e)
    if hierarchy['type'] == 'file':
        isfifo = stat.S_ISFIFO(os.stat(hierarchy['path']).st_mode)
        if isfifo:
            ftype = "fifo"
        else:
            try:
                data = open(hierarchy['path'], 'rb').read()
                ftype = "bin" if is_binary_string(data) else "txt"
                if (ftype == "txt"):
                    hierarchy['content'] = data.decode("utf-8")
                else:
                    hierarchy['content'] = base64.b64encode(data).decode("utf-8")
            except Exception as e:
                ftype = str(e)
        hierarchy['ftype'] = ftype
    return hierarchy

def recurse_unpack(inobj, relpath=""):
    if (inobj['type'] == "folder"):
        rpname = relpath + inobj['name']
        sys.stderr.write("folder name: " + rpname + os.linesep)
        os.mkdir(rpname)
        for tchild in inobj['children']:
            recurse_unpack(tchild, relpath=relpath+inobj['name']+os.sep)
    elif (inobj['type'] == "file"):
        rfname = relpath + inobj['name']
        sys.stderr.write("file name: " + rfname + os.linesep)
        if inobj['ftype'] == "txt":
            with codecs.open(rfname, "w", "utf-8") as text_file:
                text_file.write(inobj['content'])
        elif inobj['ftype'] == "bin":
            with open(rfname, "wb") as bin_file:
                bin_file.write(base64.b64decode(inobj['content']))

if __name__ == '__main__':
    import json
    import sys
    parser = argparse.ArgumentParser(formatter_class=SmartDescriptionFormatter, description="""R|Command-line App that packs/archives (and vice-versa) a directory to a plain-text .json file; should work w/ both Python 2.7 and 3.4
see full help text in https://gist.github.com/anonymous/1a68bf2c9134fd5312219c8f68713632""")
    parser.add_argument('input_paths', type=str, nargs='*', default=['.'],
        help='Paths to files/directories to include in the archive; or path to .json archive file')
    group = parser.add_mutually_exclusive_group(required=True)
    group.add_argument('-a', '--archive', action='store_true', help="Interpret input_paths as paths to files/directories, and archive them to a .json file (output to stdout)")
    group.add_argument('-u', '--unpack', action='store_true', help="Interpret input_paths as path to an archive .json file, and unpack it in the current directory")
    args = parser.parse_args()
    if (args.archive):
        valid_input_paths = []
        for p in args.input_paths:
            if os.path.isdir(p) or os.path.exists(p):
                valid_input_paths.append(p)
            else:
                sys.stderr.write("Ignoring invalid input path: " + p + os.linesep)
        sys.stderr.write("Encoding input path(s): " + str(valid_input_paths) + os.linesep)
        path_hier_arr = [path_hierarchy(vp) for vp in valid_input_paths]
        outjson = json.dumps(path_hier_arr, indent=2, sort_keys=True, separators=(',', ': '))
        print(outjson)
    elif (args.unpack):
        valid_input_paths = []
        for p in args.input_paths:
            if os.path.isdir(p) or os.path.exists(p):
                valid_input_paths.append(p)
            else:
                sys.stderr.write("Ignoring invalid input path: " + p + os.linesep)
        for vp in valid_input_paths:
            with open(vp) as data_file:
                data = json.load(data_file)
            for datachunk in data:
                recurse_unpack(datachunk)
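For comparison: if the readable per-file JSON structure is not a requirement, a much shorter route to the same "directory as plain text" goal is to tar the directory and base64-encode the archive, using only the standard library. A rough sketch (function names are made up):

import base64
import io
import tarfile

def pack(dir_path, out_txt):
    # tar+gzip the directory in memory, then write it out as base64 text
    buf = io.BytesIO()
    with tarfile.open(fileobj=buf, mode="w:gz") as tar:
        tar.add(dir_path)
    with open(out_txt, "w") as f:
        f.write(base64.b64encode(buf.getvalue()).decode("ascii"))

def unpack(in_txt):
    # decode the base64 text, then extract into the current directory
    with open(in_txt) as f:
        data = base64.b64decode(f.read())
    with tarfile.open(fileobj=io.BytesIO(data), mode="r:gz") as tar:
        tar.extractall()

The trade-off is that the base64 payload is not human-readable, whereas the JSON archive above keeps text files readable in place.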
I am a CS major at the University of Alabama. We have a project in our Python class and I am stuck, probably for some stupid reason, but I can't seem to find the answer.
Here is the link to the project, as it would be a pain to try and explain on here:
http://beastie.cs.ua.edu/cs150/projects/project1.html
Here is my code:
import sys
from scanner import scan

def clInput():
    # Gets command line input; check the argument count before indexing sys.argv
    if len(sys.argv) != 4:
        print('Incorrect number of arguments, should be 3')
        sys.exit(1)
    log1 = sys.argv[1]
    log2 = sys.argv[2]
    name = sys.argv[3]
    return log1, log2, name

def openFiles(log1, log2):
    # Opens sys.argv[1] & [2] for reading
    f1 = open(log1, 'r')
    f2 = open(log2, 'r')
    return f1, f2

def merge(log1, log2):
    # Merges parsed logs into list without '---'
    log1Parse = [[]]
    log2Parse = [[]]
    log1Count = 0
    log2Count = 0
    for i in log1:
        if i != ['---']:
            log1Parse[log1Count].append(i)
        else:
            log1Count += 1
            log1Parse.append([])
    for i in log2:
        if i != ['---']:
            log2Parse[log2Count].append(i)
        else:
            log2Count += 1
            log2Parse.append([])
    return log1Parse[0] + log2Parse[0] + log1Parse[1] + log2Parse[1]

def searchMerge(name, merged):
    # Searches merged list for sys.argv[3]
    for i in range(len(merged)):
        if merged[i][1] == name:
            print(merged[i][0], merged[i][1], " ".join(merged[i][2:]))

def main():
    log1, log2, name = clInput()
    f1, f2 = openFiles(log1, log2)
    # Sets the contents of the two scanned files to variables
    tokens1 = scan(f1)
    tokens2 = scan(f2)
    # Call to merge and search
    merged = merge(tokens1, tokens2)
    searchMerge(name, merged)

main()
OK, so here's the problem: we are to merge two lists together into a sorted master list, delimited at the ---'s.
My two log files match the ones posted on the website I linked to above. This code works; however, if there are more than two instances of --- in each list, it will not jump to the next list to get the other tokens, and so forth. I have it working for two with the merge function. At the end of that function I return
return(log1Parse[0] + log2Parse[0] + log1Parse[1] + log2Parse[1])
but this only works for two instances of ---. Is there any way I can change my return to look at all of the indexes instead of having to manually put in [0], [1], [2], etc.? I need it to delimit and merge for an arbitrary amount. Please help!
p.s. Disregard the noobness... I'm a novice, we all gotta start somewhere.
p.p.s. The "from scanner import scan" is a scanner I wrote to take in all of the tokens in a given list.
so.py:

import sys

def main():
    # check and load command line arguments
    if len(sys.argv) != 4:
        print('Incorrect number of arguments, should be 3')
        sys.exit(1)
    log1, log2, name = sys.argv[1], sys.argv[2], sys.argv[3]

    # open files using file io
    f1 = open(log1, 'r')
    f2 = open(log2, 'r')

    # list comprehension to process and filter log files
    l1 = [x.strip().split(" ", 2) for x in f1.readlines() if x.strip() != "---"]
    l2 = [x.strip().split(" ", 2) for x in f2.readlines() if x.strip() != "---"]
    f1.close()
    f2.close()

    sorted_merged_lists = sorted(l1 + l2)
    results = [x for x in sorted_merged_lists if x[1] == name]
    for result in results:
        print(result)

main()
CLI:
$ python so.py log1.txt log2.txt Matt
['12:06:12', 'Matt', 'Logged In']
['13:30:07', 'Matt', 'Opened Terminal']
['15:02:00', 'Matt', 'Opened Evolution']
['15:31:16', 'Matt', 'Logged Out']
docs:
http://docs.python.org/release/3.0.1/tutorial/datastructures.html#list-comprehensions
http://docs.python.org/release/3.0.1/library/stdtypes.html?highlight=strip#str.strip
http://docs.python.org/release/3.0.1/library/stdtypes.html?highlight=split#str.split
http://docs.python.org/release/3.0.1/library/functions.html?highlight=sorted#sorted
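As for the direct question about generalizing the return statement in merge(): the per-section chunks can be interleaved for any number of --- sections with itertools. A sketch, assuming log1Parse and log2Parse are the lists of sections built by the loops in the question:

from itertools import chain, zip_longest

def merge_all(log1Parse, log2Parse):
    # Interleave sections: log1Parse[0], log2Parse[0], log1Parse[1], log2Parse[1], ...
    # zip_longest pads with empty lists if one log has more sections than the other.
    interleaved = chain.from_iterable(zip_longest(log1Parse, log2Parse, fillvalue=[]))
    # Concatenate every section into one flat list of tokens.
    return list(chain.from_iterable(interleaved))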