I am attempting to sort a dataframe by a column called 'GameId', which are currently of type string and when I attempt to sort the result is unexpected. I have tried the following but still return a type string.
TEST['GameId'] = TEST['GameId'].astype(int)
type('GameId')
One way to make the data life easier is using dataclasses!
from dataclasses import dataclass
# here will will be calling the dataclass decorator to send hints for data type!
#dataclass
class Columns:
channel_id : int
frequency_hz : int
power_dBmV : float
name : str
# this class will call the data class to organise the data as data.frequency data.power_dBmV etc
class RadioChannel:
radio_values = ['channel_id', 'frequency', 'power_dBmV']
def __init__(self, data): # self is 'this' but for python, it just means that you mean to reference 'this' or self instance
self.data = data # this instances data is called data here
data = Columns(channel_id=data[0], frequency=data[1], power_dBmv=data[4], name=data[3]) # now we give data var a val!
def present_data(self):
# this is optional class method btw
from rich.console import Console
from rich.table import Table
console = Console()
table = Table(title="My Radio Channels")
for item in self.radio_values:
table.add_column(item)
table.add_row(data.channel_id, data.frequency_hz, data.power_dBmv)
console.print(table)
# ignore this if its confusing
# now inside your functional part of your script
if __name__ == '__main__':
myData = []
# calling an imaginary file here to read
with open("my_radio_data_file", 'r') as myfile:
mylines = myfile.readlines()
for line in myline:
myData.append(line)
myfile.close()
#my data would look like a string ["value", value, 00, 0.0, "hello joe, from: world"]
ch1 = radioChannel(data=myData[0])
ch1.present_data()
This way you can just call the class object on each line of a data file. and print it to see if it lines up. once you get the hang of it, it starts to get fun.
I used rich console here, but it works well with pandas and normal dataframes!
dataclasses help the interpreter find its way with type hints and class structure.
Good Luck and have fun!
I have a file with non-classic formatting so I need to use the spark.DataFrameReader (spark.read.csv) on the raw file directly so that I can set the appropriate parsing configurations.
How can I do this?
You'll want to follow the methodology over here. Strongly recommend using unit test based methods to iterate on your code to recover the file contents.
Your compute function code will look like:
from transforms.api import transform, Output, Input
from transforms.verbs.dataframes import union_many
def read_files(spark_session, paths):
parsed_dfs = []
for file_name in paths:
parsed_df = spark_session.read.option("header", "true").csv(file_name)
parsed_dfs += [parsed_df]
output_df = union_many(*parsed_dfs)
return output_df
#transform(
the_output=Output("ri.foundry.main.dataset.my-awesome-output"),
the_input=Input("ri.foundry.main.dataset.my-awesome-input"),
)
def my_compute_function(the_input, the_output, ctx):
session = ctx.spark_session
input_filesystem = the_input.filesystem()
hadoop_path = input_filesystem.hadoop_path
files = input_filesystem.ls('**/*.csv.gz').map(lambda file_name: hadoop_path + file_name)
output_df = read_files(session, files)
the_output.write_dataframe(output_df)
I try to figure out how to get nested data as dictionary/property from yaml file.
The code below works if I provide the function with only one level.
example :
result = parse_yaml_file(config_yaml_file, 'section')
but fails if I try something like :
result = parse_yaml_file(yaml_file, 'section.sub-section')
or
result = parse_yaml_file(yaml_file, '[\'section\'][\'sub-section\']')
python3 code :
def parse_yaml_file(yml_file, section):
print('section : ' + section)
data_dict = {}
try:
with open(yml_file) as f:
data_dict = (yaml.load(f))
except (FileNotFoundError, IOError):
exit_with_error('Issue finding/opening ' + yml_file)
if not section:
return data_dict
else:
return data_dict.get(section)
result = parse_yaml_file(yaml_file, 'section.sub-section.property')
print(json.dumps(result, indent=4))
Is it possible to parse only on part/section of the yaml file ?
Or just retrieve one sub-section/property from the parsed result ?
I know I can get it from the dictionary like :
data_dict['section']['sub-section']['property']
but I want it to be flexible, and not hardcoded since the data to grab is provided as argument to the function.
Thanks a lot for your help.
You could try using a library to help search the parsed yaml file e.g. dpath
https://pypi.org/project/dpath/
import yaml
import dpath.util
def parse_yaml(yml_file, section):
with open(yml_file,'r') as f:
data_dict = yaml.load(f)
return dpath.util.search(data_dict,section)
parse_yaml('file.yml','section/sub-section')
I am reading an XML to transfer its attributes to other XML file with the same source. However something is wrong in the loop as I am overwriting the first element of the dictionary with the newest one.
The XMl looks like this:
<sxml locale="en-US" version="1.0" segtype="sentence" options="eol=unix;" createdby="AP 2.24.1" datatype="regexp" targetlocale="DE">
<family blockId="1" mat="33" type="freeOS" seccion="2" datatype="html" subtype="BSD"><section sectionId="1">
<product>FreeBSD</product>
</section></family>
<family blockId="2" mat="32" type="privative" seccion="3" datatype="html" subtype="commercial"><section sectionId="1">
<product>Windows</product><ws> </ws>
</section><section sectionId="2">
<product>Sistema operativo.</product>
</section></family>
</sxml>
And I want to get the attributes: "mat", "seccion", "type" and "subtype".
My code is:
from lxml import etree as et
from pathlib import Path
def add_attributes(files_path_str, proc_path_str, attributes):
"""
Adds the attributes to the frequent files.
"""
product_path = Path(files_path_str)
proc_files = Path(unk_path_str).glob('*.sxml')
dict_notes_src = dict()
list_src_sxml_files = full_sxml_path.glob('**/*.sxml')
for sxml_file in list_full_sxml_files:
xml_parser = et.parse(str(sxml_file))
root_xml = xml_parser.getroot()
print(sxml_file)
dict_notes_src_temp = __generate_notes_product_dict(root_xml, attributes)
dict_notes_src = {**dict_notes_src, **dict_notes_src_temp}
#It is the part where I copy the attributes to the processed files. The bug is not found in this part.
#The bug is somewhere in the generation of the dictionary
#for proc_file in list_proc_sxml_files:
# xml_parser = et.parse(str(unk_file))
# root_unk_xml = xml_parser.getroot()
# tree = __add_notes_to_unk(root_unk_xml, dict_notes_src)
# tree.write(str(unk_file), encoding='utf-8', pretty_print=True)
def __generate_notes_product_dict(root_xml, attributes):
"""
Internal method to process the xml file to get a dictionary product-note.
"""
translatable_elements = root_xml.xpath('family')
notes_product = dict()
dict_values = dict()
for element in translatable_elements:
product_elements = element.xpath('./section/product')
list_attrib_values = []
print(element.tag, element.attrib)
#satt_note = element.attrib['satt_note']
# List comprehension fails if there is a segment without an expected attribute.
#list_attrib_values = [element.attrib[attribute] for attribute in attributes]
# Checks if there are attributes that does not exist in the Full WordFast file.
# If that is the case ignores them and adds a None value.
for attribute in attributes:
try:
list_attrib_values.append(element.attrib[attribute])
print('Reading the attributes. {} : {}'.format(attribute, element.attrib[attribute]))
logging.debug('__generate_notes_product_dict: Add values of the attributes {}: {}'.format(
attribute, element.attrib[attribute]))
except KeyError:
list_attrib_values.append(None)
if len(product_elements) > 0:
for product_element in product_elements:
#product_element = element.xpath('./segment/product')[0]
product_str = str(et.tostring(product_element), 'utf-8')
# Create the string of the content of the product element.
product_str = ' '.join(product_str.split())
if list_attrib_values is not None:
if product_str not in notes_product:
# Generate a dictionary with the product text as key.
#notes_product[product_str] = satt_note
print(product_str)
for attribute in attributes:
try:
print(element.tag, element.attrib)
dict_values[attribute] = element.attrib[attribute]
except KeyError:
dict_values[attribute] = None
#for attribute, value in zip(attributes, list_attrib_values):
# if value is not None:
# print ('Adding the values {}: {}'.format(attribute, value))
# dict_values[attribute] = value
attrib_product[product_str] = dict_values
return attrib_product
add_attributes(folder_where_is_stored_xml, folder_where_save_xml,["mat", "seccion", "type", "subtype"]
It return a dictionary that all the products have the attributes of the last family.
I've been debugging the code and it looks like the when I running attrib_product[product_str] = dict_values is looping through all the values of the dict_values and stores only the last one.
Any ideas where am I doing wrong? I am not able to see why it is happening.
I need to fill pdf form in batch, so tried to write a python code to do it for me from a csv file. I used second answer in this question and it fills the forms fine, however when I open the filled forms the answers does not show unless the corresponding field is selected. Also the answers does not show when the form is printed. I looked into PyPDF2 documents to see if I can flatten the generated forms but this features has not been implemented yet even though has been asked for about a year ago. My preference is not to use pdftk so I can compile the script without the need for more dependency. When using the original code in the mentioned question, some fields show in the print and some doesn't which makes me confused on how they're working. Any help is appreciated.
Here's the code.
# -*- coding: utf-8 -*-
from collections import OrderedDict
from PyPDF2 import PdfFileWriter, PdfFileReader
def _getFields(obj, tree=None, retval=None, fileobj=None):
"""
Extracts field data if this PDF contains interactive form fields.
The *tree* and *retval* parameters are for recursive use.
:param fileobj: A file object (usually a text file) to write
a report to on all interactive form fields found.
:return: A dictionary where each key is a field name, and each
value is a :class:`Field<PyPDF2.generic.Field>` object. By
default, the mapping name is used for keys.
:rtype: dict, or ``None`` if form data could not be located.
"""
fieldAttributes = {'/FT': 'Field Type', '/Parent': 'Parent', '/T': 'Field Name', '/TU': 'Alternate Field Name',
'/TM': 'Mapping Name', '/Ff': 'Field Flags', '/V': 'Value', '/DV': 'Default Value'}
if retval is None:
retval = {} #OrderedDict()
catalog = obj.trailer["/Root"]
# get the AcroForm tree
if "/AcroForm" in catalog:
tree = catalog["/AcroForm"]
else:
return None
if tree is None:
return retval
obj._checkKids(tree, retval, fileobj)
for attr in fieldAttributes:
if attr in tree:
# Tree is a field
obj._buildField(tree, retval, fileobj, fieldAttributes)
break
if "/Fields" in tree:
fields = tree["/Fields"]
for f in fields:
field = f.getObject()
obj._buildField(field, retval, fileobj, fieldAttributes)
return retval
def get_form_fields(infile):
infile = PdfFileReader(open(infile, 'rb'))
fields = _getFields(infile)
return {k: v.get('/V', '') for k, v in fields.items()}
def update_form_values(infile, outfile, newvals=None):
pdf = PdfFileReader(open(infile, 'rb'))
writer = PdfFileWriter()
for i in range(pdf.getNumPages()):
page = pdf.getPage(i)
try:
if newvals:
writer.updatePageFormFieldValues(page, newvals)
else:
writer.updatePageFormFieldValues(page,
{k: f'#{i} {k}={v}'
for i, (k, v) in
enumerate(get_form_fields(infile).items())
})
writer.addPage(page)
except Exception as e:
print(repr(e))
writer.addPage(page)
with open(outfile, 'wb') as out:
writer.write(out)
if __name__ == '__main__':
import csv
import os
from glob import glob
cwd=os.getcwd()
outdir=os.path.join(cwd,'output')
csv_file_name=os.path.join(cwd,'formData.csv')
pdf_file_name=glob(os.path.join(cwd,'*.pdf'))[0]
if not pdf_file_name:
print('No pdf file found')
if not os.path.isdir(outdir):
os.mkdir(outdir)
if not os.path.isfile(csv_file_name):
fields=get_form_fields(pdf_file_name)
with open(csv_file_name,'w',newline='') as csv_file:
csvwriter=csv.writer(csv_file,delimiter=',')
csvwriter.writerow(['user label'])
csvwriter.writerow(['fields']+list(fields.keys()))
csvwriter.writerow(['Mr. X']+list(fields.values()))
else:
with open(csv_file_name,'r',newline='') as csv_file:
csvreader=csv.reader(csv_file,delimiter=',')
csvdata=list(csvreader)
fields=csvdata[1][1:]
for frmi in csvdata[2:]:
frmdict=dict(zip(fields,frmi[1:]))
outfile=os.path.join(outdir,frmi[0]+'.pdf')
update_form_values(pdf_file_name, outfile,frmdict)
I had the same issue and apparently adding the "/NeedsAppearance" attribute to the PdfWriter object of the AcroForm fixed the problem (see https://github.com/mstamy2/PyPDF2/issues/355). With much help from ademidun (https://github.com/ademidun), I was able to populate a pdf form and have the values of the fields show properly. The following is an example:
from PyPDF2 import PdfFileReader, PdfFileWriter
from PyPDF2.generic import BooleanObject, NameObject, IndirectObject
def set_need_appearances_writer(writer):
# See 12.7.2 and 7.7.2 for more information:
# http://www.adobe.com/content/dam/acom/en/devnet/acrobat/
# pdfs/PDF32000_2008.pdf
try:
catalog = writer._root_object
# get the AcroForm tree and add "/NeedAppearances attribute
if "/AcroForm" not in catalog:
writer._root_object.update(
{
NameObject("/AcroForm"): IndirectObject(
len(writer._objects), 0, writer
)
}
)
need_appearances = NameObject("/NeedAppearances")
writer._root_object["/AcroForm"][need_appearances] = BooleanObject(True)
return writer
except Exception as e:
print("set_need_appearances_writer() catch : ", repr(e))
return writer
reader = PdfFileReader("myInputPdf.pdf", strict=False)
if "/AcroForm" in reader.trailer["/Root"]:
reader.trailer["/Root"]["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)}
)
writer = PdfFileWriter()
set_need_appearances_writer(writer)
if "/AcroForm" in writer._root_object:
writer._root_object["/AcroForm"].update(
{NameObject("/NeedAppearances"): BooleanObject(True)}
)
field_dictionary = {"Field1": "Value1", "Field2": "Value2"}
writer.addPage(reader.getPage(0))
writer.updatePageFormFieldValues(writer.getPage(0), field_dictionary)
with open("myOutputPdf.pdf", "wb") as fp:
writer.write(fp)
The underlying reason form fields are not showing up after being filled in, is that the values are not being added to the stream. Adding "NeedAppearances" tells the PDF reader that it needs to update the appearance, in this case it needs to create a stream for each field value, but not all PDF readers will honor that, and the fields may still look blank or have the default values.
The best solution to make sure the fields are updated for any reader is to create a stream for each field and add it to the field's XObject.
Here is an example solution for single line text fields. It also encodes the stream, updates the default value, and sets the fields to read only, which are all optional.
# Example data.
data = {
"field_name": "some value"
}
# Get template.
template = PdfReader("template-form.pdf", strict=False)
# Initialize writer.
writer = PdfWriter()
# Add the template page.
writer.add_page(template.pages[0])
# Get page annotations.
page_annotations = writer.pages[0][PageAttributes.ANNOTS]
# Loop through page annotations (fields).
for index in range(len(page_annotations)): # type: ignore
# Get annotation object.
annotation = page_annotations[index].get_object() # type: ignore
# Get existing values needed to create the new stream and update the field.
field = annotation.get(NameObject("/T"))
new_value = data.get(field, 'N/A')
ap = annotation.get(AnnotationDictionaryAttributes.AP)
x_object = ap.get(NameObject("/N")).get_object()
font = annotation.get(InteractiveFormDictEntries.DA)
rect = annotation.get(AnnotationDictionaryAttributes.Rect)
# Calculate the text position.
font_size = float(font.split(" ")[1])
w = round(float(rect[2] - rect[0] - 2), 2)
h = round(float(rect[3] - rect[1] - 2), 2)
text_position_h = h / 2 - font_size / 3 # approximation
# Create a new XObject stream.
new_stream = f'''
/Tx BMC
q
1 1 {w} {h} re W n
BT
{font}
2 {text_position_h} Td
({new_value}) Tj
ET
Q
EMC
'''
# Add Filter type to XObject.
x_object.update(
{
NameObject(StreamAttributes.FILTER): NameObject(FilterTypes.FLATE_DECODE)
}
)
# Update and encode XObject stream.
x_object._data = FlateDecode.encode(encode_pdfdocencoding(new_stream))
# Update annotation dictionary.
annotation.update(
{
# Update Value.
NameObject(FieldDictionaryAttributes.V): TextStringObject(
new_value
),
# Update Default Value.
NameObject(FieldDictionaryAttributes.DV): TextStringObject(
new_value
),
# Set Read Only flag.
NameObject(FieldDictionaryAttributes.Ff): NumberObject(
FieldFlag(1)
)
}
)
# Clone document root & metadata from template.
# This is required so that the document doesn't try to save before closing.
writer.clone_reader_document_root(template)
# write "output".
with open(f"output.pdf", "wb") as output_stream:
writer.write(output_stream) # type: ignore
Thanks to fidoriel and others from the discussion here: https://github.com/py-pdf/PyPDF2/issues/355.
This is what works for me on Python 3.8 and PyPDF4 (but I think it will work as well with PyPDF2):
#!/usr/bin/env python3
from PyPDF4.generic import NameObject
from PyPDF4.generic import TextStringObject
from PyPDF4.pdf import PdfFileReader
from PyPDF4.pdf import PdfFileWriter
import random
import sys
reader = PdfFileReader(sys.argv[1])
writer = PdfFileWriter()
# Try to "clone" the original one (note the library has cloneDocumentFromReader)
# but the render pdf is blank.
writer.appendPagesFromReader(reader)
writer._info = reader.trailer["/Info"]
reader_trailer = reader.trailer["/Root"]
writer._root_object.update(
{
key: reader_trailer[key]
for key in reader_trailer
if key in ("/AcroForm", "/Lang", "/MarkInfo")
}
)
page = writer.getPage(0)
params = {"Foo": "Bar"}
# Inspired by updatePageFormFieldValues but also handles checkboxes.
for annot in page["/Annots"]:
writer_annot = annot.getObject()
field = writer_annot["/T"]
if writer_annot["/FT"] == "/Btn":
value = params.get(field, random.getrandbits(1))
if value:
writer_annot.update(
{
NameObject("/AS"): NameObject("/On"),
NameObject("/V"): NameObject("/On"),
}
)
elif writer_annot["/FT"] == "/Tx":
value = params.get(field, field)
writer_annot.update(
{
NameObject("/V"): TextStringObject(value),
}
)
with open(sys.argv[2], "wb") as f:
writer.write(f)
This updates text fields and checkboxes.
I believe the key part is copying some parts from the original file:
reader_trailer = reader.trailer["/Root"]
writer._root_object.update(
{
key: reader_trailer[key]
for key in reader_trailer
if key in ("/AcroForm", "/Lang", "/MarkInfo")
}
)
Note: Please feel free to share this solution in other places. I consulted a lot of SO questions related to this topic.
What worked for me was to reopen with pdfrw
The following has worked for me for Adobe Reader, Acrobat, Skim, and Mac OS Preview:
pip install pdfrw
import pdfrw
pdf = pdfrw.PdfReader("<input_name>")
for page in pdf.pages:
annotations = page.get("/Annots")
if annotations:
for annotation in annotations:
annotation.update(pdfrw.PdfDict(AP=""))
pdf.Root.AcroForm.update(pdfrw.PdfDict(NeedAppearances=pdfrw.PdfObject('true')))
pdfrw.PdfWriter().write("<output_name>", pdf)
alepisa's answer was the closest to working for me (thank you, alepisa), but I just had to change one small section
elif writer_annot["/FT"] == "/Tx":
value = params.get(field, field)
writer_annot.update(
This was producing an output where my PDF had the desired fields updated based off the dictionary with field names and values I passed it, but every fillable field, whether I wanted them filled or not, was populated with the name of that fillable field. I changed the elif statement to the one below and everything worked like a charm!
elif writer_annot["/FT"] == "/Tx":
field_value = field_values.get(field_name, "")
writer_annot.update({NameObject("/V"): TextStringObject(field_value),
#This line below is just for formatting
NameObject("/DA"): TextStringObject("/Helv 0 Tf 0 g")})
This nested back into the rest of alepisa's script should work for anybody having issues with getting the output in Acrobat to show the values without clicking on the cell!