upgrade Django to 1.11 - formset changes

upgrade Django to 1.11 - formset changes - python-3.5

I have problem with my app.
After upgrading Django to latest version - 1.11.4, I am getting the same error for three of my tests.
AssertionError: False is not true : The formset 'formset' in context 0 does not contain the non-form error 'Musi być podana co najmniej jedna stawka' (actual errors: ['Proszę wysłać 1 lub więcej formularzy.'])
I know that errors are in polish language but the fact is one, It was working with Django 1.9 and Django 1.10 but it is not with Django 1.11.
Could somebody tell me what was changed in Django 1.11 with forests??
I have read a lot of stuff about it and I have tried almost everything but old version is not working.
Maybe this formset doesn't see my error message and I have to make errors in another way after upgrading?
class BaseTaxRateInlineFormSet(UniqueFieldsFormSetMixin,
NotEmptyInlineFormSetMixin,
BaseInlineFormSet):
_unique_fields = ('valid_from',)
msg_at_least_one_required = __('Musi być podana co najmniej jedna stawka')
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.queryset = self.queryset.order_by('valid_from')
superclasses:
class NotEmptyInlineFormSetMixin(object):
"""
Prevents from deleting last not empty row (when other are empty)
"""
msg_at_least_one_required = __(
'Co najmniej jeden wiersz musi być wypełniony'
)
code_at_least_one_required = 'at_least_one_required'
def _form_is_empty(self, form):
return not len(list(filter(None, form.cleaned_data.values())))
def clean(self):
super().clean()
if any(self.errors):
return
forms_to_delete = 0
forms_empty = 0
for form in self.forms:
if self._should_delete_form(form):
forms_to_delete += 1
elif self._form_is_empty(form):
forms_empty += 1
if forms_to_delete + forms_empty == self.total_form_count():
raise ValidationError(
self.msg_at_least_one_required,
code=self.code_at_least_one_required
)
class UniqueFieldsFormSetMixin(object):
"""
Checks if fields (_unique_fields) have unique values in all forms
"""
_unique_fields = []
msg_field_not_unique = __('Wartość nie może się powtarzać')
code_field_not_unique = 'field_not_unique'
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
if 'unique_fields' in kwargs:
self._unique_fields = kwargs.pop('unique_fields', [])
def clean(self):
super().clean()
if self._unique_fields:
values = {field: set() for field in self._unique_fields}
for form in self.forms:
for field in self._unique_fields:
val = form.cleaned_data.get(field)
if val:
if val in values[field]:
form.add_error(
field,
ValidationError(
self.msg_field_not_unique,
code=self.code_field_not_unique
)
)
values[field].add(val)
and test:
def test_update_removing_last_row(self):
data = {
# One row for existing rates and one empty.
self.FORMSET_PREFIX + '-TOTAL_FORMS': 2,
self.FORMSET_PREFIX + '-INITIAL_FORMS': 1,
self.FORMSET_PREFIX + '-MAX_NUM_FORMS': 100
}
# First row exists
data = self._load_formset_data(
data, dict(
self.rate_data_1, DELETE='on',
**{'id': self.rate_pk_1, self.related_field: self.instance_pk}
), name=self.instance_name
)
self.client.force_login(self.superuser)
response = self.client.post(self.url_update, data=data)
# Can't remove last row
self.assertEqual(response.status_code, 200)
self.assertFormsetError(
response, 'formset', None, None,
self.base_formset_class.msg_at_least_one_required
)

The problem is with two lines of Django formset.
if not form.has_changed() and i >= self.initial_form_count():
empty_forms_count += 1
In this situation Django knows which of my forms are empty and not changed.
How to change my tests or maybe implementation of formset to have not fails in my testing machine?

Related

loop efficiency and performance impact calling api in python

Team:
My concern is on redundancy, efficient use of loops and best approach to get the desired result.
Usecase: get on call user name and create jira ticket with it.
below is my entire code and it runs fine for me. This is my very first OOP project.
Flow: I am calling two APIS (jira and pager api).
First calling pager api and getting who is oncall currently. Here am getting a list of nested dicts as response that am looping on.
Then calling jira api to create ticket with that above oncall user.
i want to learn to calculate Big0 and improve.
since this is my very first time can I get to see if there any problems or inefficiency or divergence from standard practices?
import requests
import json
import os
from jira import JIRA
from pdpyras import APISession
from collections import OrderedDict
JIRA_DICT_KEY = "JIRA"
JIRA_CONFIG = {'server': "https://jirasw.tom.com"}
JIRA_USER = os.environ['JIRA_USER']
JIRA_PW = os.environ['JIRA_PW']
PD_API_KEY = os.environ['PD_API_KEY']
USER_EMAIL = os.environ['USER_EMAIL']
class ZidFinder(object):
def __init__(self):
self.active_zid_errors = dict()
self.team_oncall_dict = dict()
self.onCall = self.duty_oncall()
self.jira = self.init_jira()
def init_jira(self):
jira = JIRA(options=JIRA_CONFIG, auth=(JIRA_USER, JIRA_PW))
return jira
def duty_oncall(self, *args):
session = APISession(PD_API_KEY, default_from=USER_EMAIL)
total = 1 #true or false
limit = 100 # this var is to pull limit records at a time.
teamnm = "Product SRE Team"
team_esp_name = "Product SRE Escalation Policy"
teamid = ""
teamesplcyid = ""
if args:
offset = args[0]
total_teams = args[1]
if offset <= total_teams:
print("\nfunc with args with new offset {} called\n".format(offset))
teams = session.get('/teams?limit={0}&total={1}&offset={2}'.format(limit,total,offset))
else:
print("Reached max teams, no more team records to pull")
return
else:
print("\nPull first set of {} teams as defined by limit var and loop more if team not found..\n".format(limit))
teams = session.get('/teams?limit={0}&total={1}'.format(limit,total))
if not teams.ok:
return
else:
tj = teams.json()
tjd = tj['teams']
print("\n")
for adict in tjd:
if not adict['name'] == teamnm:
continue
elif adict['name'] == teamnm:
teamid = adict['id']
print("Found team..\n",adict['name'], "id: {0}".format(teamid))
esclp = session.get('/escalation_policies?total={0}&team_ids%5B%5D={1}'.format(total,teamid))
if not esclp.ok:
print("Failed pulling Escalation polices for team '{}'".format(teamnm))
return
else:
ep = esclp.json()
epj = esclp.json()['escalation_policies']
if not epj:
print("Escalation polices for team '{}' not defined".format(teamnm))
return
else:
for adict in epj:
if not adict['summary'] == team_esp_name:
continue
else:
teamesplcyid = adict['id']
print("{} id: {}\n".format(team_esp_name, teamesplcyid))
oncalls = session.get('/oncalls?total={0}&escalation_policy_ids%5B%5D={1}'.format(total,teamesplcyid))
if not oncalls.ok:
print("Issue in getting oncalls")
return
else:
ocj = oncalls.json()['oncalls']
for adict in ocj:
if adict['escalation_level'] == 1 or adict['escalation_level'] == 2:
self.team_oncall_dict[adict['schedule']['summary']] = adict['user']['summary']
continue
if self.team_oncall_dict:
if len(self.team_oncall_dict) == 1:
print("\nOnly Primary onCall is defined")
print("\n",self.team_oncall_dict)
else:
print(" Primary and other calls defined")
print("\n",OrderedDict(self.team_oncall_dict),"\n")
return
else:
print("Calling with next offset as team was not found in the records pulled under limit..")
if tj['offset'] <= tj['total'] or tj['more'] == True:
setoffset = limit + tj['offset']
self.onCall(setoffset, tj['total'])
def create_jiras(self):
node = ["node1", "node2"]
zid_label = ["id90"]
labels = [node, zid_label]
print("Creating a ticket for node {} with description: {}".format(node, str(self.active_zid_errors[node])))
if self.msre_oncall_dict:
print("Current onCalls pulled from Duty, use them as assignee in creating jira tickets..")
new_issue = self.jira.create_issue(project='TEST', summary='ZID error on node {}'.format(node),
description=str(self.active_zid_errors[node]), issuetype={'name': 'Bug'}, assignee={'name': self.msre_oncall_dict['Product SRE Primary']},labels=labels)
print("Created a new ticket: ", new_issue.key, new_issue.fields.summary)
self.active_zid_errors[node][JIRA_DICT_KEY] = new_issue.key
else:
print("Current onCalls were not pulled from Duty, create jira with defautl assignee..")
new_issue = self.jira.create_issue(project='TEST', summary='ZID error on node {}'.format(node),
description=str(self.active_zid_errors[node]), issuetype={'name': 'Bug'},labels=labels)
print("Created a new ticket: ", new_issue.key, new_issue.fields.summary)
self.active_zid_errors[node][JIRA_DICT_KEY] = new_issue.key
if __name__== "__main__":
o = ZidFinder()

Python: Subclassing a dict to have two keys and a defaultvalue

following the two very readable tutorials 1 and 2, I would like to create a dictionary with two keys that gives a defaultvalue in case the key-pair does not exist.
I managed two fullfill the first condition with
from collections import defaultdict
class DictX(dict):
def __getattr__(self, key1 = None, key2 = None):
try:
return self[(key1,key2)]
# This in idea of how to implement the defaultdict. But it does not seem to work
# except KeyError as k::
# self[(key1,key2)] = 0.
# return self[(key1,key2)]
## or just return 0
except KeyError as k:
raise AttributeError(k)
def __setattr__(self, key1, key2, value):
self[(key1,key2)] = value
def __delattr__(self, key):
try:
del self[key]
except KeyError as k:
raise AttributeError(k)
def __repr__(self):
return '<DictX ' + dict.__repr__(self) + '>'
sampledict = DictX()
sampledict[3,5] = 5
sampledict[1,4] = 4
print("Checking the dict ",sampledict[1,4])
# This line is going to throw an error
print("Checking the default dict ",sampledict[3,6])
How do I code the defaultvalue behaviour?
Pro-Question:
If I just give one value sampledict[1,] or sampledict[1,:], I would like to get a list of all key - value pairs that start with 1. Is that possible?

bad performance for loop with instance and bulk create

I need to use bulk_create to create a lot of "detalle"(details), the problem is i have to iterate trough a json to get the arguments, and i got 4 fk so django ask to me for the instance, not the id. but to have id i have to do a .get(), so i got a bad performance, because its 4 gets by each iteration.
its there a way to get all objects instances and put in a list or something to perform load then the instance without using get every time?
class DetalleFichaAllViewSet(viewsets.ModelViewSet):
serializer_class = DetalleFichaUpdateAllSerializer
def create(self, request, *args, **kwargs):
user = self.request.user
data = request.data
try:
ficha = Ficha.objects.get(autor=user.id)
DetalleFicha.objects.filter(ficha=ficha.id).delete()
except Http404:
pass
# Create Ficha
now = datetime.now()
date_time = now.strftime("%Y-%m-%d %H:%M")
print("AAAAAA DATA:", data)
Ficha.objects.filter(autor=user.id).update(fecha_creacion=date_time, autor=user,
nombre=data["nombreFicha"], descripcion=data["descripcionFicha"])
ficha = Ficha.objects.filter(autor=user.id).last()
recintos = Recinto.objects.all()
productos = Producto.objects.all()
estandar_productos = EstandarProducto.objects.all()
cotizaciones = Cotizacion.objects.all()
detalles_ficha = []
for detalle in data["detalles"]:
recinto = recintos.get(id=detalle[1])
producto = productos.get(id=detalle[10])
estandar_producto = estandar_productos.get(id=detalle[9])
try:
cotizacion = cotizaciones.get(id=detalle[4])
except ObjectDoesNotExist:
cotizacion = None
print("Fecha: ", detalle[8])
detalle = DetalleFicha(carreras=detalle[0],
recinto=recinto, nombre=detalle[2],
cantidad_a_comprar=detalle[3], cotizacion=cotizacion,
valor_unitario=detalle[5], valor_total=detalle[6],
documento=detalle[7], fecha_cotizacion=detalle[8],
estandar_producto=estandar_producto, producto=producto,
ficha=ficha)
detalles_ficha.append(detalle)
DetalleFicha.objects.bulk_create(detalles_ficha)
print("Array convertida", detalles_ficha)
print(detalles_ficha[0])
return Response(status=status.HTTP_200_OK)

An unexpected keyword argument '_metrics' in _pandas for custom expectations with great expectations v3 api?

I am trying to create a very simple expectation with Great Expectations v3 api: expect_column_values_to_be_positive. I am using PandasExecutionEngine and my data asset is a pandas dataframe.
my_custom_expectation.py is located in the plugins/ folder.
Here is my code in my_custom_expectation.py:
from great_expectations.execution_engine import (
PandasExecutionEngine,
)
from great_expectations.expectations.metrics import (
ColumnMapMetricProvider,
column_condition_partial
)
from great_expectations.expectations.expectation import (
ColumnMapExpectation,
)
def check_positive(value):
if value:
return True if value > 0 else False
return True
class ColumnValueIsPositive(ColumnMapMetricProvider):
condition_metric_name = "column_values.to_be_positive"
#column_condition_partial(engine=PandasExecutionEngine)
def _pandas(cls, column, **kwargs):
return column.apply(lambda x: check_positive(x))
class ExpectColumnValuesToBePositive(ColumnMapExpectation):
map_metric = "column_values.to_be_positive"
Then in my jupyter notebook, I try to create my expectation:
from my_custom_expectation import ExpectColumnValuesToBePositive
validator.expect_column_values_to_be_positive(column="duration")
However, I get the following error:
TypeError Traceback (most recent call last)
/tmp/ipykernel_5957/859745029.py in <module>
----> 1 validator.expect_column_values_to_be_positive(column="duration")
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in inst_expectation(*args, **kwargs)
285
286 else:
--> 287 raise err
288 return validation_result
289
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in inst_expectation(*args, **kwargs)
240 )
241 else:
--> 242 validation_result = expectation.validate(
243 validator=self,
244 evaluation_parameters=self._expectation_suite.evaluation_parameters,
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/expectations/expectation.py in validate(self, validator, configuration, evaluation_parameters, interactive_evaluation, data_context, runtime_configuration)
631 evaluation_parameters, interactive_evaluation, data_context
632 )
--> 633 evr = validator.graph_validate(
634 configurations=[configuration],
635 runtime_configuration=runtime_configuration,
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in graph_validate(self, configurations, metrics, runtime_configuration)
499 return evrs
500 else:
--> 501 raise err
502
503 for configuration in processed_configurations:
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in graph_validate(self, configurations, metrics, runtime_configuration)
477 # an exception occurring as part of resolving the combined validation graph impacts all expectations in suite.
478 try:
--> 479 self.resolve_validation_graph(
480 graph=graph,
481 metrics=metrics,
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in resolve_validation_graph(self, graph, metrics, runtime_configuration)
555
556 metrics.update(
--> 557 self._resolve_metrics(
558 execution_engine=self._execution_engine,
559 metrics_to_resolve=ready_metrics,
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/validator/validator.py in _resolve_metrics(execution_engine, metrics_to_resolve, metrics, runtime_configuration)
603 """A means of accessing the Execution Engine's resolve_metrics method, where missing metric configurations are
604 resolved"""
--> 605 return execution_engine.resolve_metrics(
606 metrics_to_resolve=metrics_to_resolve,
607 metrics=metrics,
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/execution_engine/execution_engine.py in resolve_metrics(self, metrics_to_resolve, metrics, runtime_configuration)
283 # than data to optimize compute in the future
284 try:
--> 285 resolved_metrics[metric_to_resolve.id] = metric_fn(
286 **metric_provider_kwargs
287 )
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/expectations/metrics/metric_provider.py in inner_func(*args, **kwargs)
53 #wraps(metric_fn)
54 def inner_func(*args, **kwargs):
---> 55 return metric_fn(*args, **kwargs)
56
57 inner_func.metric_engine = engine
~/.local/share/virtualenvs/ge-YbASoQtb/lib/python3.8/site-packages/great_expectations/expectations/metrics/map_metric_provider.py in inner_func(cls, execution_engine, metric_domain_kwargs, metric_value_kwargs, metrics, runtime_configuration)
326 df = df[df[column_name].notnull()]
327
--> 328 meets_expectation_series = metric_fn(
329 cls,
330 df[column_name],
TypeError: _pandas() got an unexpected keyword argument '_metrics'
Am I missing something? I followed this example to write my expectation/metric.

I was able to create ny own expectations in the end by literally copy pasting everything from here. So my code looks like this in the end:
from typing import Dict
from great_expectations.core.expectation_configuration import parse_result_format
from great_expectations.expectations.expectation import (
ColumnMapExpectation,
_format_map_output,
)
from great_expectations.expectations.util import render_evaluation_parameter_string
from great_expectations.render.renderer.renderer import renderer
from great_expectations.render.types import RenderedStringTemplateContent
from great_expectations.render.util import (
num_to_str,
parse_row_condition_string_pandas_engine,
substitute_none_for_missing,
)
from typing import Optional
from great_expectations.core import ExpectationConfiguration
from great_expectations.execution_engine import (
ExecutionEngine,
PandasExecutionEngine,
)
from great_expectations.expectations.metrics.map_metric_provider import (
ColumnMapMetricProvider,
column_condition_partial,
)
from great_expectations.expectations.metrics.metric_provider import (
MetricProvider,
metric_value,
)
from great_expectations.validator.validation_graph import MetricConfiguration
class ColumnValuesPositive(ColumnMapMetricProvider):
condition_metric_name = "column_values.positive"
# filter_column_isnull = False
#column_condition_partial(engine=PandasExecutionEngine)
def _pandas(cls, column, **kwargs):
print("calling pandas func in custom expectstion ******")
def check_positive(value):
print("calling check_positive.....")
if value:
return True if value > 0 else False
return True
return column.apply(lambda x: check_positive(x))
class ColumnValuesPositiveCount(MetricProvider):
"""A convenience class to provide an alias for easier access to the null count in a column."""
metric_name = "column_values.positive.count"
#metric_value(engine=PandasExecutionEngine)
def _pandas(*, metrics, **kwargs):
return metrics["column_values.nonpositive.unexpected_count"]
#classmethod
def _get_evaluation_dependencies(
cls,
metric: MetricConfiguration,
configuration: Optional[ExpectationConfiguration] = None,
execution_engine: Optional[ExecutionEngine] = None,
runtime_configuration: Optional[dict] = None,
):
dependencies: dict = super()._get_evaluation_dependencies(
metric=metric,
configuration=configuration,
execution_engine=execution_engine,
runtime_configuration=runtime_configuration,
)
dependencies["column_values.nonpositive.unexpected_count"] = MetricConfiguration(
metric_name="column_values.nonpositive.unexpected_count",
metric_domain_kwargs=metric.metric_domain_kwargs,
)
return dependencies
class ExpectColumnValuesToBePositive(ColumnMapExpectation):
"""Expect column values to be positive.
expect_column_values_to_be_positive is a \
:func:`column_map_expectation <great_expectations.execution_engine.execution_engine.MetaExecutionEngine
.column_map_expectation>`.
Args:
column (str): \
The column name.
Keyword Args:
mostly (None or a float between 0 and 1): \
Return `"success": True` if at least mostly fraction of values match the expectation. \
For more detail, see :ref:`mostly`.
Other Parameters:
result_format (str or None): \
Which output mode to use: `BOOLEAN_ONLY`, `BASIC`, `COMPLETE`, or `SUMMARY`.
For more detail, see :ref:`result_format <result_format>`.
include_config (boolean): \
If True, then include the expectation config as part of the result object. \
For more detail, see :ref:`include_config`.
catch_exceptions (boolean or None): \
If True, then catch exceptions and include them as part of the result object. \
For more detail, see :ref:`catch_exceptions`.
meta (dict or None): \
A JSON-serializable dictionary (nesting allowed) that will be included in the output without \
modification. For more detail, see :ref:`meta`.
Returns:
An ExpectationSuiteValidationResult
Exact fields vary depending on the values passed to :ref:`result_format <result_format>` and
:ref:`include_config`, :ref:`catch_exceptions`, and :ref:`meta`.
"""
# This dictionary contains metadata for display in the public gallery
library_metadata = {
"maturity": "production",
"package": "great_expectations",
"tags": ["core expectation", "column map expectation"],
"contributors": ["#great_expectations"],
"requirements": [],
}
map_metric = "column_values.positive"
#classmethod
#renderer(renderer_type="renderer.prescriptive")
#render_evaluation_parameter_string
def _prescriptive_renderer(
cls,
configuration=None,
result=None,
language=None,
runtime_configuration=None,
**kwargs
):
runtime_configuration = runtime_configuration or {}
include_column_name = runtime_configuration.get("include_column_name", True)
include_column_name = (
include_column_name if include_column_name is not None else True
)
styling = runtime_configuration.get("styling")
params = substitute_none_for_missing(
configuration.kwargs,
["column", "mostly", "row_condition", "condition_parser"],
)
if params["mostly"] is not None:
params["mostly_pct"] = num_to_str(
params["mostly"] * 100, precision=15, no_scientific=True
)
# params["mostly_pct"] = "{:.14f}".format(params["mostly"]*100).rstrip("0").rstrip(".")
template_str = "values must be positive, at least $mostly_pct % of the time."
else:
template_str = "values must be positive."
if include_column_name:
template_str = "$column " + template_str
if params["row_condition"] is not None:
(
conditional_template_str,
conditional_params,
) = parse_row_condition_string_pandas_engine(params["row_condition"])
template_str = conditional_template_str + ", then " + template_str
params.update(conditional_params)
return [
RenderedStringTemplateContent(
**{
"content_block_type": "string_template",
"string_template": {
"template": template_str,
"params": params,
"styling": styling,
},
}
)
]
#classmethod
#renderer(renderer_type="renderer.diagnostic.observed_value")
def _diagnostic_observed_value_renderer(
cls,
configuration=None,
result=None,
language=None,
runtime_configuration=None,
**kwargs
):
result_dict = result.result
try:
notpositive_percent = result_dict["unexpected_percent"]
return (
num_to_str(100 - notpositive_percent, precision=5, use_locale=True)
+ "% positive"
)
except KeyError:
return "unknown % positive"
except TypeError:
return "NaN% positive"
def get_validation_dependencies(
self,
configuration: Optional[ExpectationConfiguration] = None,
execution_engine: Optional[ExecutionEngine] = None,
runtime_configuration: Optional[dict] = None,
):
dependencies = super().get_validation_dependencies(
configuration, execution_engine, runtime_configuration
)
return dependencies
def _validate(
self,
configuration: ExpectationConfiguration,
metrics: Dict,
runtime_configuration: dict = None,
execution_engine: ExecutionEngine = None,
):
if runtime_configuration:
result_format = runtime_configuration.get(
"result_format",
configuration.kwargs.get(
"result_format", self.default_kwarg_values.get("result_format")
),
)
else:
result_format = configuration.kwargs.get(
"result_format", self.default_kwarg_values.get("result_format")
)
mostly = self.get_success_kwargs().get(
"mostly", self.default_kwarg_values.get("mostly")
)
total_count = metrics.get("table.row_count")
unexpected_count = metrics.get(self.map_metric + ".unexpected_count")
if total_count is None or total_count == 0:
# Vacuously true
success = True
else:
success_ratio = (total_count - unexpected_count) / total_count
success = success_ratio >= mostly
nonnull_count = None
return _format_map_output(
result_format=parse_result_format(result_format),
success=success,
element_count=metrics.get("table.row_count"),
nonnull_count=nonnull_count,
unexpected_count=metrics.get(self.map_metric + ".unexpected_count"),
unexpected_list=metrics.get(self.map_metric + ".unexpected_values"),
unexpected_index_list=metrics.get(
self.map_metric + ".unexpected_index_list"
),
)

PdfMiner: Erro processing the page literal required: /b'begin'

I am trying to read .pdf file using python3 with package called pdfminer which I have done successfully but for some of the page in .pdf file while reading the page using interpreter.process_page in getAllPages() of the following code I am getting an errors as follows:
error processing the page literal required: /b'begin'.
error processing the page Unknown operator: 'Qq'.
This is happening only for few docs but not able to find out what is the problem , in which case this could happen?
Code:-
class PDFDoc():
def __init__(self):
self.rsrcmgr = PDFResourceManager()
self.laparams = LAParams()
self.device = PDFPageDetailedAggregator(self.rsrcmgr, laparams=self.laparams)
self.interpreter = PDFPageInterpreter(self.rsrcmgr, self.device)
self.doc_values = []
self.total_no_of_pages = 0
self.doc_page_dict = collections.OrderedDict()
# self.doc = None
"""
Read PDF Document
"""
def readDoc(self, doc_name):
fp = open(doc_name, 'rb')
self.parser = PDFParser(fp)
self.doc = PDFDocument(self.parser)
"""
Read all pages in the document and saved in List of tuples format.
It contains the text and their coordinate info along with page number
"""
def getAllPages(self):
for page in PDFPage.create_pages(self.doc):
self.interpreter.process_page(page)
# receive the LTPage object for this page
self.device.get_result()
self.doc_values = self.device.rows
"""
Get the total number of pages
"""
def getTotalPages(self):
self.total_no_of_pages = max(self.doc_page_dict)+1
"""
Convert the document info into Page-wise dict. {Key:Value}-->{Page no:[Page text, coordinates]}
"""
def getPageDict(self):
for i in range(len(self.doc_values)):
left = self.doc_values[i][1]
bottom = self.doc_values[i][2]
content = self.doc_values[i][-1]
if self.doc_page_dict.get(self.doc_values[i][0]):
self.doc_page_dict[self.doc_values[i][0]].append({'left':left, 'bottom':bottom, 'content':content})
else:
self.doc_page_dict[self.doc_values[i][0]]=[{'left':left, 'bottom':bottom, 'content':content}]
"""
Align the page text in case they are misaligned
"""
def create_page_table_modified(self, pagedict_list):
# ##print(pagedict_list)
page_dict = collections.OrderedDict()
page_table_1 = []
page_table = []
exc_arr = []
count = 0
for line in pagedict_list:
row = []
temp_key = float(line['bottom'])
if not line in exc_arr and line["content"]:
row.append(line)
exc_arr.append(line)
for line_1 in pagedict_list:
if not line_1 in exc_arr and line_1["content"]:
# #print('last_top:', last_top, each_dict_adjusted['bottom'])
if abs(int(line["bottom"]) - int(line_1["bottom"])) <= 6:
row.append(line_1)
exc_arr.append(line_1)
if row:
page_dict[temp_key] = row
page_table.append(row)
count += 1
# ##print("\n\nPage:",page_table)
page_dict_keys = sorted(page_dict, reverse=True)
for i in page_dict_keys:
# i = sorted(i, key=lambda k: k['left'])
page_table_1.append(page_dict[i])
return page_table_1
"""
Sort the line elements based on its position coordinates
"""
def sortRowElements(self,row_list):
return sorted(row_list, key=lambda k:k['left'])
"""
Combine line elements to form the line text
"""
def combineText(self, row):
temp_ = []
# for i in range(len(row)):
text = [k['content'] for k in row]
temp_.append(' '.join(text))
return ' '.join(temp_)
"""
To call aligning and sorting functions
"""
def sortText(self):
for page in self.doc_page_dict:
self.doc_page_dict[page] = self.create_page_table_modified(self.doc_page_dict[page])
self.doc_page_dict[page] = [self.sortRowElements(line) for line in self.doc_page_dict[page]]
"""
To get text from particular page of the document --> List of line text
"""
def pageText(self, page_no):
page_text = [self.combineText(line) for line in self.doc_page_dict[page_no]]
return page_text
read_document = PDFDoc()

Develop Reference

node.js excel linux python-3.x azure haskell apache-spark rust .htaccess string

upgrade Django to 1.11 - formset changes - python-3.5

Related

loop efficiency and performance impact calling api in python

Python: Subclassing a dict to have two keys and a defaultvalue

bad performance for loop with instance and bulk create

An unexpected keyword argument '_metrics' in _pandas for custom expectations with great expectations v3 api?

PdfMiner: Erro processing the page literal required: /b'begin'

Categories

Resources