passing UTF-8 through a python script from a subprocess - python-3.x

In a Python 3 script, I set up UTF-8 output with:
sys.stdout = codecs.getwriter("utf-8")(sys.stdout.detach())
I then launch a subprocess with subprocess.Popen, and scan the results, trying to print a subset. The output is \x-escaped when it contains interesting characters. This is not what I want, I just want the UTF-8. The unwanted output is at the bottom.
process = subprocess.Popen([MVN, "-Ptrain-model"],
cwd=JPN_MODELS_DIR, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
print_interesting_info(process.stdout)
def print_interesting_info(out):
# Prints every line read from `out` that contains the text "ERROR".
# (Indentation was lost in this listing; the lines below are the function body.)
# TRAINING_FLAG is assigned but not used anywhere in the visible code.
TRAINING_FLAG = False
for buffer_line in out:
#line = str(buffer_line, "utf8").strip()
# `out` is process.stdout here, so each buffer_line is a bytes object.
# str() on bytes without an encoding returns its repr (b'...\x..'),
# which is where the escaped output shown below comes from.
line = str(buffer_line).strip()
if ("ERROR" in line):
print(line)
Unwanted output:
b'[ERROR] next segment length 109 > LONGEST_WORD input \xef\xbc\xa1\xef\xbc\x8e\xef\xbc\xa1\xef\xbd\x92\xef\xbd\x86\xef\xbd\x8f\xef\xbd\x8c\xef\xbd\x84\xef\xbd\x89\xef\xbc\x8c\xef\xbc\xa4\xef\xbd\x89\xef\xbd\x85\xef\xbc\xab\xef\xbd\x8f\xef\xbd\x8e\xef\xbd\x94\xef\xbd\x8f\xef\xbd\x92\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x81\xef\xbd\x94\xef\xbd\x85\xef\xbd\x8e\xef\xbc\x8c\xef\xbd\x85\xef\xbd\x89\xef\xbd\x8e\xef\xbd\x96\xef\xbd\x85\xef\xbd\x92\xef\xbd\x8b\xef\xbd\x81\xef\xbd\x8e\xef\xbd\x8e\xef\xbd\x94\xef\xbd\x85\xef\xbd\x93\xef\xbc\xb0\xef\xbd\x92\xef\xbd\x8f\xef\xbd\x90\xef\xbd\x81\xef\xbd\x87\xef\xbd\x81\xef\xbd\x8e\xef\xbd\x84\xef\xbd\x81\xef\xbd\x8d\xef\xbd\x89\xef\xbd\x94\xef\xbd\x94\xef\xbd\x85\xef\xbd\x8c\xef\xbd\x84\xef\xbd\x85\xef\xbd\x92\xef\xbd\x93\xef\xbd\x94\xef\xbd\x81\xef\xbd\x84\xef\xbd\x94\xef\xbd\x92\xef\xbd\x8f\xef\xbd\x8d\xef\xbd\x89\xef\xbd\x93\xef\xbd\x83\xef\xbd\x88\xef\xbd\x85\xef\xbd\x8e\xef\xbd\x88\xef\xbd\x85\xef\xbd\x89\xef\xbd\x84\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x93\xef\xbd\x83\xef\xbd\x88\xef\xbd\x85\xef\xbd\x8e\xef\xbc\xa1\xef\xbd\x92\xef\xbd\x89\xef\xbd\x93\xef\xbd\x94\xef\xbd\x8f\xef\xbd\x8b\xef\xbd\x92\xef\xbd\x81\xef\xbd\x94\xef\xbd\x89\xef\xbd\x85\xef\xbd\x89\xef\xbd\x8e\xef\xbd\x89\xef\xbd\x88\xef\xbd\x92\xef\xbd\x85\xef\xbd\x8d\xef\xbc\xab\xef\xbd\x81\xef\xbd\x8d\xef\xbd\x90\xef\xbd\x86\xef\xbd\x87\xef\xbd\x85\xef\xbd\x87\xef\xbd\x85\xef\xbd\x8e\xef\xbd\x84\xef\xbd\x81\xef\xbd\x93\xef\xbd\x83\xef\xbd\x88\xef\xbd\x92\xef\xbd\x89\xef\xbd\x93\xef\xbd\x94\xef\xbd\x8c\xef\xbd\x89\xef\xbd\x83\xef\xbd\x88\xef\xbd\x85\xef\xbc\xab\xef\xbd\x81\xef\xbd\x89\xef\xbd\x93\xef\xbd\x85\xef\xbd\x92\xef\xbd\x94\xef\xbd\x95\xef\xbd\x8d\xef\xbc\x8c\xef\xbc\xa2\xef\xbd\x95\xef\xbd\x84\xef\xbd\x81\xef\xbd\x90\xef\xbd\x85\xef\xbd\x93\xef\xbd\x94\xef\xbc\x8c\xef\xbc\x91\xef\xbc\x99\xef\xbc\x94\xef\xbc\x93\xef\xbc\x91\xef\xbc\x99\xef\xbc\x94\xef\xbc\x93\xef\xbc\x91\xef\xbc\x99\xef\xbc\x94\xef\xbc\x93\xef\xbc\x91\xef\xbc\x99\xef\xbc\x94\xef\xbc\x93\xe5\x8f\x82\
xe7\x85\xa7\xe3\x80\x82\n'

process.stdout is an io.BufferedReader. It reads bytes, and bytes.__str__ is basically a repr of the byte string. You can wrap it in an io.TextIOWrapper:
import io
def print_interesting_info(out):
    """Print every line of a UTF-8 encoded byte stream that contains "ERROR".

    Args:
        out: A binary buffered stream, e.g. ``process.stdout`` of a
            ``subprocess.Popen`` started with ``stdout=subprocess.PIPE``.
    """
    # Decode once at the I/O boundary so every line is a real ``str``;
    # calling str() on the raw bytes would give the b'...\x..' repr instead.
    text = io.TextIOWrapper(out, encoding="utf-8")
    for line in text:
        line = line.strip()
        if "ERROR" in line:
            print(line)

Related

Converts strings of binary to binary

I have a text file and I would like to read it in binary mode so I can transform its content into hexadecimal characters.
Then, I need to replace '20' by '0' and '80', 'e2', '8f' by '1'.
This would create a string of 0 and 1 (basically binary).
Finally, I need to convert this binary string into ascii characters.
I'm almost finished, but I'm struggling with the last part:
import binascii
import sys
# Read the file as bytes, turn it into a hex string, then rewrite selected
# hex pairs as '0' / '1' characters.
bin_file = 'TheMessage.txt'
with open(bin_file, 'rb') as file:
file_content = file.read().hex()
file_content = file_content.replace('20', '0').replace('80', '1').replace('e2', '1').replace('8f', '1')
print(file_content)
# binascii.a2b_uu() decodes a line of uuencoded data; file_content is a string
# of '0'/'1' characters, not uuencoded text, and this call raises the
# binascii.Error shown in the traceback below.
text_bin = binascii.a2b_uu(file_content)
The last line produces an error (I do not fully understand strings/hex/binary interpretation in python):
Traceback (most recent call last):
File "binary_to_string.py", line 34, in <module>
text_bin = binascii.a2b_uu(file_content)
binascii.Error: Trailing garbage
Could you give me a hand?
I'm working on this file: blank_file
I think you're looking for something like this? Refer to comments for why I do what I did.
import binascii
import sys
bin_file = 'TheMessage.txt'
with open(bin_file, 'rb') as file:
    file_content = file.read().hex()
# Rewrite the hex pairs as bits: '20' -> '0'; '80', 'e2' and '8f' -> '1'.
file_content = file_content.replace('20', '0').replace('80', '1').replace('e2', '1').replace('8f', '1')
# Split the bit string into groups of 8 characters (8 bits in a byte).
# If the length is not a multiple of 8, the final group is shorter and is
# still converted below.
bin_list = [file_content[i:i + 8] for i in range(0, len(file_content), 8)]
# Parse each group as a base-2 number and map that integer to its character.
message = "".join(chr(int(binary_value, 2)) for binary_value in bin_list)
print(message)
One thing I noticed while working with this is that, using your solution/file, there are 2620 bits, which is not a multiple of 8 (2620 = 327 × 8 + 4), so the last 4 bits cannot form a complete byte.

Python: input saving multiline string

# Read pasted lines until EOF (Ctrl+D), then write to notam_new.txt.
while True:
try:
# Each iteration rebinds `line`, so only the most recently read line
# is still available once the loop ends.
line = input("paste:")
except EOFError:
break
f = open("notam_new.txt", "w+")
f.write(line)
f.close()
This code returns only the last line of the multi-line input after Ctrl+D.
I tried also:
notam = input("paste new notam: ")
f = open("notam_new.txt", "w+")
f.write(notam)
f.close()
getting only the first row.
Any ideas?
You're setting line in a loop, so on every iteration you're just overwriting it with the next one. You need to accumulate your lines in a list (created before the while True) so you can keep track of all of them, and then write them to the file in a loop. You also need to add a newline after each line, as input() strips it.
# Collect every pasted line until end-of-file (Ctrl+D) is signalled.
lines = []
try:
    while True:
        lines.append(input("paste:"))
except EOFError:
    # End of input: everything read so far is kept in `lines`.
    pass
# input() drops the trailing newline, so put one back after each line.
with open("notam_new.txt", "w+") as f:
    f.writelines(line + '\n' for line in lines)

Python: read from STDIN unless a file is specified, how is it done?

I'm writing a Python script which expects a regex pattern and a file name and looks for that regex pattern within the file.
By default, the script requires a file to work on.
I want to change the script so that by default it takes its input from STDIN unless a file is specified (-f filename).
My code looks like so:
#!/usr/bin/env python3
# This Python script searches for lines matching regular expression -r (--regex) in file/s -f (--files).
import re
import argparse
#import sys
# ANSI terminal escape sequences used to decorate the printed matches.
class colored:
# Bright cyan foreground.
CYAN = '\033[96m'
# Underlined text (not referenced by the visible code).
UNDERLINE = '\033[4m'
# Reset all attributes.
END = '\033[0m'
def main(regex, file, underline, color):
# Searches each line of `file` for `regex` and prints one report per matching line.
# `file` is passed straight to open(), so it must be a path (str/bytes/PathLike),
# not an already-open stream and not None.
# The `underline` and `color` parameters are not read below; the body consults
# the module-level `args` namespace instead.
pattern = re.compile(regex)
try:
for i, line in enumerate(open(file, encoding="ascii")):
for match in re.finditer(pattern, line):
message = "Pattern {} was found on file: {} in line {}. The line is: ".format(regex, file, i+1)
# Every branch below ends in `break`, so only the first match on a line is reported.
if args.color and args.underline:
#message = "Pattern {} was found on file: {} in line {}. The line is: ".format(regex, file, i+1)
l = len(line)
print(message + colored.CYAN + line + colored.END, end="")
print(" " ,"^" * l)
break
if args.underline:
l = len(line)
print(message + line, end="")
print(" " ,"^" * l)
break
if args.color:
print(message + colored.CYAN + line + colored.END, end="")
break
if args.machine:
# Machine-readable form: file:line_number:line
print("{}:{}:{}".format(file, i+1, line), end="")
break
else:
print(message + line, end="")
break
except FileNotFoundError:
print("File not found, please supply")
pass
# Command-line entry point: parse the options and run the search.
if __name__ == "__main__":
# NOTE: the epilog mentions --files, but the option defined below is --file.
parser = argparse.ArgumentParser(description='Python regex finder', epilog = './python_parser.py --regex [pattern] --files [file]')
requiredNamed = parser.add_argument_group('required named arguments')
requiredNamed.add_argument('-r', '--regex',
help='regex pattern', required=True)
# No default is given, so args.file is None when -f is omitted.
parser.add_argument('-f', '--file',
help='file to search pattern inside')
parser.add_argument('-u', '--underline', action='store_true',
help='underline')
parser.add_argument('-c', '--color', action='store_true',
help='color')
parser.add_argument('-m', '--machine', action='store_true',
help='machine')
# `args` is module-level, which is how main() is able to read it directly.
args = parser.parse_args()
main(args.regex, args.file, args.underline, args.color)
You can see how a run looks here.
I tried using the answer from this SO question, but getting the following error:
for i, line in enumerate(open(file, encoding="ascii")):
TypeError: expected str, bytes or os.PathLike object, not _io.TextIOWrapper
Edit #1:
This is the file:
Itai
# something
uuu
UuU
# Itai
# this is a test
this is a test without comment
sjhsg763
3989746
# ddd ksjdj #kkl
I get the above error when I supply no file.
Edit#2:
When I change the file argument to that:
parser.add_argument('-f', '--file',
help='file to search pattern inside',
default=sys.stdin,
type=argparse.FileType('r'),
nargs='?'
)
And then run the script like so:
~ echo Itai | ./python_parser.py -r "[a-z]" -m
Traceback (most recent call last):
File "./python_parser.py", line 59, in <module>
main(args.regex, args.file, args.underline, args.color)
File "./python_parser.py", line 16, in main
for i, line in enumerate(open(file, encoding="ascii")):
TypeError: expected str, bytes or os.PathLike object, not NoneType
➜ ~
args.file = tmpfile
which is a file in the same directory where the script runs.
What am I doing wrong?
You wrote this:
def main(regex, file, underline, color):
...
for i, line in enumerate(open(file, encoding="ascii")):
You have some confusion about whether file denotes a filename or an already-open file object. You want it to be an open file object, so that you can pass in sys.stdin. That means main() should not attempt to open(); rather, it should rely on the caller to pass in an already-open file object.
Pushing the responsibility for calling open() up out of main() and into its caller (the `if __name__ == "__main__":` block) will let you assign file = sys.stdin by default, and then re-assign it to the result of open() if it turns out that a filename was specified.

Writing python scripts

I need to write a standalone program that would run on a python cmd. This program counts the number of characters in every line of HumptyDumpty.txt file, and outputs this to a new file.
Note that the new file needs to contain only the number of characters per line.
Here's my code:
import sys
# Usage: script.py <input file> <output file>
infilename = sys.argv[1]
outfilename = sys.argv[2]
infile=open(infilename)
outfile=open(outfilename, 'w')
char_=0
for line in infile:
# The result of split() is discarded, so this line has no effect.
line.split()
char_= len(line.strip("\n"))
# write() does not append a newline, so successive counts are written
# back to back on a single line.
outfile.write(str(char_ ))
print(line,end='')
infile.close()
outfile.close()
The output file has only one line, the concatenation xyz, instead of
x
y
z
"\n" doesn't seem to be doing the trick. Any suggestions?
If you don't want to include the white space between the words then you should replace them with an empty string.
# For each input line, count its characters with spaces removed and the
# surrounding newline characters stripped, and write that count on its own
# line of the output file.
for line in infile:
    char = len(line.replace(" ", "").strip("\n"))
    outfile.write("{}\n".format(char))
    # Echo the original line followed by its count.
    print(line, end='')
    print(char)

Piping output using subprocess.Popen and subprocess.PIPE [duplicate]

I know how to run a command using cmd = subprocess.Popen and then subprocess.communicate.
Most of the time I use a string tokenized with shlex.split as 'argv' argument for Popen.
Example with "ls -l":
import subprocess
import shlex
print subprocess.Popen(shlex.split(r'ls -l'), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
However, pipes do not seem to work... For instance, the following example returns nothing:
import subprocess
import shlex
print subprocess.Popen(shlex.split(r'ls -l | sed "s/a/b/g"'), stdin = subprocess.PIPE, stdout = subprocess.PIPE, stderr = subprocess.PIPE).communicate()[0]
Can you tell me what I am doing wrong please?
Thx
I think you want to instantiate two separate Popen objects here, one for 'ls' and the other for 'sed'. You'll want to pass the first Popen object's stdout attribute as the stdin argument to the 2nd Popen object.
Example:
p1 = subprocess.Popen('ls ...', stdout=subprocess.PIPE)
p2 = subprocess.Popen('sed ...', stdin=p1.stdout, stdout=subprocess.PIPE)
print p2.communicate()
You can keep chaining this way if you have more commands:
p3 = subprocess.Popen('prog', stdin=p2.stdout, ...)
See the subprocess documentation for more info on how to work with subprocesses.
I've made a little function to help with the piping, hope it helps. It will chain Popens as needed.
from subprocess import Popen, PIPE
import shlex
def run(cmd):
    """Run a command, optionally a ``|``-separated pipeline, without a shell.

    The command string is split on every ``|`` character (a ``|`` inside
    quotes is NOT treated specially), each part is tokenized with
    ``shlex.split`` and the resulting processes are chained stdout-to-stdin.

    Args:
        cmd: Command line such as ``"ls -l | grep foo"``.

    Returns:
        A tuple ``(output, err, exit_code)``: the decoded stdout and stderr
        of the LAST command, and the exit code of the FIRST command (kept
        as-is for existing callers; a shell would report the last one).
    """
    # Local import: the surrounding file only imports Popen and PIPE.
    from subprocess import DEVNULL

    cmd_parts = [part.strip() for part in cmd.split('|')]
    last_index = len(cmd_parts) - 1
    procs = []
    for index, cmd_part in enumerate(cmd_parts):
        upstream = procs[-1].stdout if procs else None
        procs.append(Popen(
            shlex.split(cmd_part),
            stdin=upstream,
            stdout=PIPE,
            # Only the last command's stderr is returned. Piping the others
            # without ever reading them could block a chatty process forever.
            stderr=PIPE if index == last_index else DEVNULL,
        ))
        if upstream is not None:
            # Drop the parent's copy of the read end so the upstream process
            # receives SIGPIPE (instead of hanging) if its consumer exits.
            upstream.close()

    output, err = procs[-1].communicate()
    # Reap every process so none is left behind as a zombie.
    for proc in procs:
        proc.wait()
    exit_code = procs[0].returncode
    # communicate() returns bytes; str(bytes) would give the b'...' repr.
    return (
        output.decode("utf-8", errors="replace"),
        err.decode("utf-8", errors="replace"),
        exit_code,
    )
# Example: list compressed syslog files and report the outcome.
output, err, exit_code = run("ls -lha /var/log | grep syslog | grep gz")
if exit_code != 0:
    # print() with a single argument behaves the same on Python 2 and 3.
    print("Output:")
    print(output)
    print("Error:")
    print(err)
    # Handle error here
else:
    # Be happy :D
    print(output)
shlex only splits up spaces according to the shell rules, but does not deal with pipes.
It should, however, work this way:
import subprocess
import shlex
# Equivalent of the shell pipeline: ls -l | sed "s/a/b/g"
sp_ls = subprocess.Popen(shlex.split(r'ls -l'), stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# universal_newlines=True makes communicate() return text rather than bytes.
sp_sed = subprocess.Popen(shlex.split(r'sed "s/a/b/g"'), stdin=sp_ls.stdout, stdout=subprocess.PIPE, stderr=subprocess.PIPE, universal_newlines=True)
sp_ls.stdin.close()  # makes it similar to /dev/null
sp_ls.stdout.close()  # lets ls receive SIGPIPE if sed exits first
# Read from the LAST process of the pipeline; ls's output is consumed by sed.
# Taking [0] keeps stdout only, which makes you ignore any errors.
output = sp_sed.communicate()[0]
sp_ls.wait()
print(output)
according to help(subprocess)'s
Replacing shell pipe line
-------------------------
output=`dmesg | grep hda`
==>
p1 = Popen(["dmesg"], stdout=PIPE)
p2 = Popen(["grep", "hda"], stdin=p1.stdout, stdout=PIPE)
output = p2.communicate()[0]
HTH
"""
Why don't you use shell
"""
def output_shell(line):
    """Run ``line`` through the system shell and return its standard output.

    Because the shell interprets the string, pipes and redirections work.

    Args:
        line: The shell command line to execute.

    Returns:
        The command's decoded stdout, or ``None`` if the process could not
        be started or exited with a non-zero status.
    """
    # NOTE(security): shell=True hands `line` to the shell verbatim; never
    # build it from untrusted input.
    try:
        shell_command = Popen(line, stdout=PIPE, stderr=PIPE, shell=True)
    except (OSError, ValueError):
        return None
    # communicate() drains both pipes and waits for the process to exit,
    # so no separate wait() call is needed.
    output, _err = shell_command.communicate()
    if shell_command.returncode != 0:
        print("Shell command failed to execute")
        return None
    # communicate() returns bytes; str(bytes) would give the b'...' repr.
    return output.decode("utf-8", errors="replace")
Thanks to #hernvnc, #glglgl, and #Jacques Gaudin for the answers. I fixed the code from #hernvnc; his version hangs in some scenarios.
import shlex
from subprocess import PIPE
from subprocess import Popen
def run(cmd, input=None):
    """Run a command, optionally a ``|``-separated pipeline, without a shell.

    The command string is split on every ``|`` character (a ``|`` inside
    quotes is NOT treated specially), each part is tokenized with
    ``shlex.split`` and the resulting processes are chained stdout-to-stdin.

    Args:
        cmd: Command line such as ``"grep foo | tail -1"``.
        input: Optional ``bytes`` fed to the first command's stdin. When it
            is ``None`` the first command inherits the caller's stdin.

    Returns:
        A tuple ``(output, err, exit_code)``: the decoded stdout and stderr
        of the LAST command, and the exit code of the FIRST command (kept
        as-is for existing callers; a shell would report the last one).
    """
    # Local imports: the surrounding file only imports shlex, Popen and PIPE.
    import threading
    from subprocess import DEVNULL

    cmd_parts = [part.strip() for part in cmd.split('|')]
    last_index = len(cmd_parts) - 1
    procs = []
    for index, cmd_part in enumerate(cmd_parts):
        if procs:
            upstream = procs[-1].stdout
        elif input is not None:
            upstream = PIPE
        else:
            upstream = None
        procs.append(Popen(
            shlex.split(cmd_part),
            stdin=upstream,
            stdout=PIPE,
            # Only the last command's stderr is returned. Piping the others
            # without ever reading them could block a chatty process forever.
            stderr=PIPE if index == last_index else DEVNULL,
        ))
        if index > 0:
            # Drop the parent's copy of the read end so the upstream process
            # receives SIGPIPE (instead of hanging) if its consumer exits.
            upstream.close()

    first, last = procs[0], procs[-1]
    if input is not None and first is not last:
        def feed():
            # A consumer that exits early closes the pipe; that is not an
            # error worth reporting. stdin must be closed in every case,
            # otherwise the first command never sees end-of-file and hangs.
            try:
                first.stdin.write(input)
            except BrokenPipeError:
                pass
            finally:
                try:
                    first.stdin.close()
                except BrokenPipeError:
                    pass

        # Feed stdin from a helper thread while the main thread drains the
        # final output; writing synchronously deadlocks once the input is
        # larger than the pipe buffers.
        feeder = threading.Thread(target=feed)
        feeder.daemon = True
        feeder.start()
        output, err = last.communicate()
        feeder.join()
    else:
        # Single command (or no input): communicate() writes the input,
        # closes stdin and reads both output pipes without deadlocking.
        output, err = last.communicate(input)

    # Reap every process so none is left behind as a zombie.
    for proc in procs:
        proc.wait()
    exit_code = first.returncode
    # communicate() returns bytes; str(bytes) would give the b'...' repr.
    return (
        output.decode("utf-8", errors="replace"),
        err.decode("utf-8", errors="replace"),
        exit_code,
    )
# test case below
inp = b'[ CMServer State ]\n\nnode node_ip instance state\n--------------------------------------------\n1 linux172 10.90.56.172 1 Primary\n2 linux173 10.90.56.173 2 Standby\n3 linux174 10.90.56.174 3 Standby\n\n[ ETCD State ]\n\nnode node_ip instance state\n--------------------------------------------------\n1 linux172 10.90.56.172 7001 StateFollower\n2 linux173 10.90.56.173 7002 StateLeader\n3 linux174 10.90.56.174 7003 StateFollower\n\n[ Cluster State ]\n\ncluster_state : Normal\nredistributing : No\nbalanced : No\ncurrent_az : AZ_ALL\n\n[ Datanode State ]\n\nnode node_ip instance state | node node_ip instance state | node node_ip instance state\n------------------------------------------------------------------------------------------------------------------------------------------------------------------------\n1 linux172 10.90.56.172 6001 P Standby Normal | 2 linux173 10.90.56.173 6002 S Primary Normal | 3 linux174 10.90.56.174 6003 S Standby Normal'
cmd = "grep -E 'Primary' | tail -1 | awk '{print $3}'"
run(cmd, input=inp)

Resources