secretary/secretary.py

566 lines
20 KiB
Python
Raw Normal View History

2012-07-19 05:33:21 -06:00
#!/usr/bin/python
# * Copyright (c) 2012-2014 Christopher Ramirez chris.ramirezg@gmail.com
2014-07-18 23:58:49 -06:00
# *
# * Licensed under the MIT license.
2012-07-19 05:33:21 -06:00
"""
Secretary
2014-07-19 00:01:13 -06:00
This project is a document engine which makes use of LibreOffice
documents as templates and uses the semantics of jinja2 to control
variable printing and control flow.
2012-07-19 05:33:21 -06:00
2014-07-19 00:01:13 -06:00
To render a template:
engine = Renderer()
2014-07-19 00:01:13 -06:00
result = engine.render(template_file, template_var1=...)
2012-07-19 05:33:21 -06:00
"""
from __future__ import unicode_literals, print_function
2014-07-11 14:14:22 -06:00
import io
import re
import sys
2014-07-11 14:14:22 -06:00
import logging
import zipfile
from xml.dom.minidom import parseString
from jinja2 import Environment, Undefined
# Each ODF node type together with every alias a template author may write
# in a field's `text:description` to refer to it.  The `before::`/`after::`
# variants resolve to the same node type; the prefix itself is interpreted
# by the code that consumes FLOW_REFERENCES.
_FLOW_REFERENCE_ALIASES = (
    ('text:p', (
        'text:p',
        'paragraph',
        'before::paragraph',
        'after::paragraph',
    )),
    ('table:table-row', (
        'table:table-row',
        'table-row',
        'row',
        'before::table-row',
        'after::table-row',
        'before::row',
        'after::row',
    )),
    ('table:table-cell', (
        'table:table-cell',
        'table-cell',
        'cell',
        'before::table-cell',
        'after::table-cell',
        'before::cell',
        'after::cell',
    )),
)

# Maps every accepted alias to the qualified ODF node name it stands for.
FLOW_REFERENCES = dict(
    (alias, node_name)
    for node_name, aliases in _FLOW_REFERENCE_ALIASES
    for alias in aliases
)

# Qualified node names a field reference may resolve to.
SUPPORTED_FIELD_REFERECES = [
    node_name for node_name, _aliases in _FLOW_REFERENCE_ALIASES
]
2013-09-03 17:25:01 -06:00
# ---- Exceptions
class SecretaryError(Exception):
    """Base error raised by secretary for failures it reports itself."""
class UndefinedSilently(Undefined):
    """
    Undefined value that renders as an empty string instead of failing.

    Calling an instance or reading any attribute from it yields another
    UndefinedSilently, so chained lookups on a missing template variable
    keep evaluating to an empty string.
    See http://stackoverflow.com/questions/6182498
    """

    def silently_undefined(*args, **kwargs):
        # Text representation of an undefined value: nothing at all.
        return ''

    def return_new(*args, **kwargs):
        # Any call or attribute access produces a fresh silent undefined.
        return UndefinedSilently()

    __unicode__ = silently_undefined
    __str__ = silently_undefined
    __call__ = return_new
    __getattr__ = return_new
# ************************************************
2013-07-31 14:49:54 -06:00
#
# SECRETARY FILTERS
2013-07-31 14:49:54 -06:00
#
# ************************************************
def pad_string(value, length=5):
    """
    Jinja filter: left-pad `value` with zeros.

    args:
        value: Any object; it is converted to text before padding.
        length: Minimum width of the result (default 5).

    returns:
        The text form of `value`, zero-filled on the left up to `length`
        characters. Values already that long are returned unpadded.
    """
    # This module enables unicode_literals, so type('') is `unicode` on
    # Python 2 and `str` on Python 3.  Converting with it (instead of the
    # builtin `str`) avoids UnicodeEncodeError for non-ASCII text on
    # Python 2.
    text_type = type('')
    return text_type(value).zfill(length)
class Renderer(object):
    """
    Main engine to convert an ODT document into a jinja
    compatible template and render it.

    Basic use example:
        engine = Renderer()
        result = engine.render('template.odt', foo='bar')

    Renderer provides an `environment` attribute which can be used
    to provide custom filters to the ODF render.

        engine = Renderer()
        engine.environment.filters['custom_filter'] = filter_function
        result = engine.render('template.odt')
    """

    # A field whose whole (stripped) content looks like a jinja tag.
    _JINJA_FIELD_RE = re.compile(r'^{[{|%].*[%|}]}$', re.DOTALL)
    # A field that is a jinja block/control tag ({% ... %}).
    _JINJA_BLOCK_RE = re.compile(r'^{%[^{}]*%}$', re.DOTALL)

    def __init__(self, environment=None, **kwargs):
        """
        Create a Renderer instance.

        args:
            environment: Use this jinja2 environment. If not specified, we
                         create a new environment for this class instance
                         and register the `pad` and `markdown` filters on it.
            **kwargs: Accepted for compatibility; currently unused.

        returns:
            None
        """
        self.log = logging.getLogger(__name__)
        self.log.debug('Initing a Renderer instance\nTemplate')

        if environment:
            self.environment = environment
        else:
            self.environment = Environment(undefined=UndefinedSilently,
                                           autoescape=True)
            # Register filters
            self.environment.filters['pad'] = pad_string
            self.environment.filters['markdown'] = self.markdown_filter

    def _unpack_template(self, template):
        """
        Read every member of the template archive.

        An Open/LibreOffice document is just a ZIP file.

        args:
            template: Path or file-like object accepted by zipfile.ZipFile.

        returns:
            A dict mapping each archive member name to its raw content.
        """
        self.log.debug('Unpacking template file')

        archive_files = {}
        with zipfile.ZipFile(template, 'r') as archive:
            for member in archive.infolist():
                archive_files[member.filename] = archive.read(member.filename)

        # Logged before returning; the original placed this after the
        # return statement, where it could never run.
        self.log.debug('Unpack completed')
        return archive_files

    def _pack_document(self, files):
        """
        Store `files` into a new in-memory ZIP archive.

        args:
            files: dict mapping member names to their content.

        returns:
            The io.BytesIO object holding the finished archive.
        """
        self.log.debug('packing document')
        zip_file = io.BytesIO()

        with zipfile.ZipFile(zip_file, 'w') as zipdoc:
            # ODF packages expect `mimetype` to be the first member and to
            # be stored uncompressed so the type can be sniffed from the
            # file header.
            if 'mimetype' in files:
                zipdoc.writestr('mimetype', files['mimetype'],
                                zipfile.ZIP_STORED)

            for fname, content in files.items():
                if fname == 'mimetype':
                    continue
                zipdoc.writestr(fname, content, zipfile.ZIP_DEFLATED)

        self.log.debug('Document packing completed')
        return zip_file

    def _jinja_field_info(self, field):
        """
        Classify a <text:text-input> node.

        returns:
            None when the field holds no jinja tag, otherwise a tuple
            (field_content, is_block_tag).
        """
        if not field.hasChildNodes():
            return None

        data = getattr(field.childNodes[0], 'data', None)
        if data is None:
            # First child is not a character-data node (e.g. an element).
            return None

        content = data.strip()
        if not self._JINJA_FIELD_RE.match(content):
            # Field does not contain jinja template tags
            return None

        return content, bool(self._JINJA_BLOCK_RE.match(content))

    def _block_discard_target(self, field):
        """
        Find the node a reference-less block tag should replace.

        Climbs from `field` while the parent contains no other secretary
        field, returning the highest such ancestor. When the climb reaches
        the document root without meeting another field, there is nothing
        to share a parent with, so only the field itself is replaced.
        """
        candidate = field
        parent = candidate.parentNode
        while parent is not None and parent.nodeType != parent.DOCUMENT_NODE:
            if getattr(parent, 'secretary_field_count', 0) > 1:
                return candidate
            candidate = parent
            parent = candidate.parentNode
        return field

    def _prepare_template_tags(self, xml_document):
        """
        Turn user fields of `xml_document` into plain jinja tags, in place.
        """
        # Here we search for every field node present in xml_document.
        # For each field we found we do:
        # * if field is a print field ({{ field }}), we replace it with a
        #   <text:span> node.
        #
        # * if field is a control flow ({% %}), then we find immediate node of
        #   type indicated in field's `text:description` attribute and replace
        #   the whole node and its children with field's content.
        #
        #   If `text:description` attribute starts with `before::` or
        #   `after::`, then we move field content before or after the node
        #   in description.
        #
        #   If no `text:description` is available, find the immediate common
        #   parent of this and any other field and replace its child and
        #   original parent of field with the field content.
        #
        #   e.g.: original
        #   <table>
        #       <table:row>
        #           <field>{% for bar in bars %}</field>
        #       </table:row>
        #       <paragraph>
        #           <field>{{ bar }}</field>
        #       </paragraph>
        #       <table:row>
        #           <field>{% endfor %}</field>
        #       </table:row>
        #   </table>
        #
        #   After processing:
        #   <table>
        #       {% for bar in bars %}
        #       <paragraph>
        #           <text:span>{{ bar }}</text:span>
        #       </paragraph>
        #       {% endfor %}
        #   </table>
        self.log.debug('Preparing template tags')
        fields = xml_document.getElementsByTagName('text:text-input')

        # First, count secretary fields. Counting must finish for the whole
        # document before any node is moved, because the replacement step
        # relies on the per-ancestor totals.
        jinja_fields = []
        for field in fields:
            info = self._jinja_field_info(field)
            if info is None:
                continue

            field_content, is_block_tag = info
            self.inc_node_fields_count(
                field.parentNode, 'block' if is_block_tag else 'variable')
            jinja_fields.append((field, field_content, is_block_tag))

        # Do field replacement and moving
        for field, field_content, is_block_tag in jinja_fields:
            discard = field
            field_reference = field.getAttribute(
                'text:description').strip().lower()

            if re.search(r'\|markdown', field_content):
                # a markdown field should take the whole paragraph
                field_reference = 'text:p'

            if field_reference:
                # User specified a reference. Replace immediate parent node
                # of type indicated in reference with this field's content.
                node_type = FLOW_REFERENCES.get(field_reference, False)
                if node_type:
                    discard = self._parent_of_type(field, node_type)
                    if discard is None:
                        raise SecretaryError(
                            'Field "%s" references "%s" but is not inside '
                            'a <%s> node' % (field_content, field_reference,
                                             node_type))

                jinja_node = self.create_text_node(xml_document,
                                                   field_content)

            elif is_block_tag:
                # Find the common immediate parent of this and any other
                # field.
                discard = self._block_discard_target(field)
                jinja_node = self.create_text_node(xml_document,
                                                   field_content)

            else:
                jinja_node = self.create_text_span_node(xml_document,
                                                        field_content)

            parent = discard.parentNode
            if not field_reference.startswith('after::'):
                parent.insertBefore(jinja_node, discard)
            else:
                # insertBefore() with a None reference appends, which
                # covers the case where `discard` is the last child.
                parent.insertBefore(jinja_node, discard.nextSibling)

            if field_reference.startswith(('after::', 'before::')):
                # Do not remove whole field container. Just remove the
                # <text:text-input> parent node if field has it.
                container = self._parent_of_type(field, 'text:p')
                discard = container if container is not None else field
                parent = discard.parentNode

            parent.removeChild(discard)

    def _unescape_entities(self, xml_text):
        """
        Unescape the XML entities `&gt;` and `&lt;` inside jinja tags.

        Serialising the document escapes `<` and `>` typed inside a tag
        (e.g. `{% if a > 1 %}`), which jinja would not understand. Every
        occurrence inside each `{{ ... }}` / `{% ... %}` tag is restored;
        text outside of tags is left escaped so the XML stays well formed.
        """
        def restore(match):
            tag = match.group(0)
            tag = re.sub(r'&gt;', '>', tag, flags=re.IGNORECASE)
            return re.sub(r'&lt;', '<', tag, flags=re.IGNORECASE)

        # Non-greedy, so each tag is handled on its own; DOTALL lets a tag
        # span several lines.
        return re.sub(r'\{\{.*?\}\}|\{%.*?%\}', restore, xml_text,
                      flags=re.DOTALL)

    def _encode_escape_chars(self, xml_text):
        """
        Encode escape sequences and control whitespace as ODF elements.

        Inside attribute-less text containers (<text:p>, <text:span>, ...)
        the literal two-character sequences backslash-n and backslash-t
        become <text:line-break/> and <text:tab/>. Afterwards every raw
        tab, carriage return and line feed becomes <text:s/>.
        """
        container = r'text:(?:[ahp]|ruby-base|span|meta|meta-field)'

        # Ordered on purpose: the rules used to live in a dict, whose
        # iteration order is not defined on Python 2.
        # NOTE(review): the original listed the backslash-n pattern three
        # times (so only the last entry survived); the entry mapped to a
        # tab element is assumed to have been meant for backslash-t.
        escape_rules = (
            (r'\\n', '<text:line-break/>'),
            (r'\\t', '<text:tab/>'),
        )

        for escape, element in escape_rules:
            pattern = re.compile(
                r'(<%s>.*)%s(.*</%s>)' % (container, escape, container),
                re.IGNORECASE)
            # Keep the text on both sides of the escape; the original
            # replacement re-inserted the escape and dropped the tail.
            replacement = r'\1%s\2' % element

            # The greedy match consumes one escape per container per pass,
            # so repeat until none is left. Each pass removes at least one
            # escape, hence the loop terminates.
            while True:
                xml_text, replaced = pattern.subn(replacement, xml_text)
                if not replaced:
                    break

        # `|` is not listed here: inside a character class it is a literal
        # pipe, which made the original replace pipes in document text.
        return re.sub('[\t\r\n]', '<text:s/>', xml_text)

    def _render_xml(self, xml_document, **kwargs):
        """
        Render one XML document through jinja.

        args:
            xml_document: minidom document; its fields are converted to
                          jinja tags in place before rendering.
            **kwargs: Template variables.

        returns:
            A new minidom document parsed from the rendered output.
        """
        self.log.debug('Rendering XML object')
        # Bound up front so the error handler can tell whether the
        # template text was ever produced.
        template_string = None

        try:
            self._prepare_template_tags(xml_document)
            template_string = self._unescape_entities(xml_document.toxml())
            jinja_template = self.environment.from_string(template_string)

            result = jinja_template.render(**kwargs)
            result = self._encode_escape_chars(result)

            return parseString(result.encode('ascii', 'xmlcharrefreplace'))
        except Exception:
            if template_string is None:
                self.log.debug('Error preparing template tags')
            else:
                self.log.debug('Error rendering template:\n%s',
                               template_string)
            raise
        finally:
            self.log.debug('Rendering xml object finished')

    def render(self, template, **kwargs):
        """
        Render a template

        args:
            template: A template file. Could be a string or a file instance
            **kwargs: Template variables. Similar to jinja2

        returns:
            A binary stream which contains the rendered document.
        """
        self.log.debug('Initing a template rendering')
        self.files = self._unpack_template(template)

        # Keep content and styles object since many functions or
        # filters may work with them
        self.content = parseString(self.files['content.xml'])
        self.styles = parseString(self.files['styles.xml'])

        # Render content.xml
        self.content = self._render_xml(self.content, **kwargs)

        # Render styles.xml
        self.styles = self._render_xml(self.styles, **kwargs)

        self.log.debug('Template rendering finished')

        self.files['content.xml'] = self.content.toxml().encode(
            'ascii', 'xmlcharrefreplace')
        self.files['styles.xml'] = self.styles.toxml().encode(
            'ascii', 'xmlcharrefreplace')

        document = self._pack_document(self.files)
        return document.getvalue()

    def _parent_of_type(self, node, of_type):
        """
        Return the closest ancestor of `node` whose (lower-cased) node
        name equals `of_type`, or None if there is no such ancestor.
        """
        parent = getattr(node, 'parentNode', None)
        while parent is not None:
            if parent.nodeName.lower() == of_type:
                return parent
            parent = parent.parentNode
        return None

    def create_text_span_node(self, xml_document, content):
        """
        Creates a <text:span> element wrapping a text node with `content`.
        """
        span = xml_document.createElement('text:span')
        span.appendChild(self.create_text_node(xml_document, content))
        return span

    def create_text_node(self, xml_document, text):
        """
        Creates a text node
        """
        return xml_document.createTextNode(text)

    def inc_node_fields_count(self, node, field_type='variable'):
        """
        Increase field count of node and its parents.

        Maintains three counters as attributes on every visited node:
        `secretary_field_count` (all fields) plus either
        `secretary_variable_count` or `secretary_block_count`, chosen by
        `field_type` ('variable' selects the former, anything else the
        latter).
        """
        if field_type == 'variable':
            typed_counter = 'secretary_variable_count'
        else:
            typed_counter = 'secretary_block_count'

        counters = ('secretary_field_count', 'secretary_variable_count',
                    'secretary_block_count')

        while node is not None:
            for counter in counters:
                if not hasattr(node, counter):
                    setattr(node, counter, 0)

            node.secretary_field_count += 1
            setattr(node, typed_counter, getattr(node, typed_counter) + 1)
            node = node.parentNode

    def get_style_by_name(self, style_name):
        """
        Search in <office:automatic-styles> for style_name.
        Return None if style_name is not found. Otherwise
        return the style node
        """
        containers = self.content.getElementsByTagName(
            'office:automatic-styles')
        if not containers:
            return None

        for style_node in containers[0].childNodes:
            # Text/comment children have no attributes to inspect.
            if style_node.nodeType != style_node.ELEMENT_NODE:
                continue
            if style_node.hasAttribute('style:name') and \
               (style_node.getAttribute('style:name') == style_name):
                return style_node

        return None

    def insert_style_in_content(self, style_name, attributes=None,
                                **style_properties):
        """
        Insert a new style into content.xml's <office:automatic-styles> node.

        args:
            style_name: Value for the new node's `style:name`.
            attributes: Optional dict; each key is written as a
                        `style:<key>` attribute.
            **style_properties: Written verbatim as attributes of a
                        <style:text-properties> child.

        returns:
            A reference to the newly created node
        """
        auto_styles = self.content.getElementsByTagName(
            'office:automatic-styles')[0]
        style_node = self.content.createElement('style:style')
        style_node.setAttribute('style:name', style_name)
        style_node.setAttribute('style:family', 'text')
        style_node.setAttribute('style:parent-style-name', 'Standard')

        if attributes:
            for k, v in attributes.items():
                style_node.setAttribute('style:%s' % k, v)

        if style_properties:
            style_prop = self.content.createElement('style:text-properties')
            for k, v in style_properties.items():
                style_prop.setAttribute('%s' % k, v)

            style_node.appendChild(style_prop)

        return auto_styles.appendChild(style_node)

    def markdown_filter(self, markdown_text):
        """
        Convert a markdown text into a ODT formatted text

        returns:
            A string of ODT XML, or '' when `markdown_text` is not text.

        raises:
            SecretaryError: when the markdown2 library is not installed.
        """
        # `basestring` only exists on Python 2.
        try:
            string_types = basestring
        except NameError:
            string_types = str

        if not isinstance(markdown_text, string_types):
            return ''

        from xml.dom import Node
        from markdown_map import transform_map

        try:
            from markdown2 import markdown
        except ImportError:
            raise SecretaryError('Could not import markdown2 library. Install it using "pip install markdown2"')

        styles_cache = {}   # cache styles searching
        html_text = markdown(markdown_text)
        # Wrap first, encode afterwards: formatting bytes into a text
        # template yields "b'...'" on Python 3.
        wrapped_html = '<html>%s</html>' % html_text
        xml_object = parseString(
            wrapped_html.encode('ascii', 'xmlcharrefreplace'))

        # Transform HTML tags as specified in transform_map
        # Some tags may require extra attributes in ODT.
        # Additional attributes are indicated in the 'attributes' property
        for tag in transform_map:
            rule = transform_map[tag]
            html_nodes = xml_object.getElementsByTagName(tag)
            for html_node in html_nodes:
                odt_node = xml_object.createElement(rule['replace_with'])

                # Transfer child nodes. They are moved rather than cloned
                # so that nested nodes of the same tag, which are already
                # in `html_nodes`, stay part of the live tree and still
                # get replaced.
                while html_node.firstChild is not None:
                    odt_node.appendChild(html_node.firstChild)

                # Add style-attributes defined in transform_map
                if 'style_attributes' in rule:
                    for k, v in rule['style_attributes'].items():
                        odt_node.setAttribute('text:%s' % k, v)

                # Add defined attributes
                if 'attributes' in rule:
                    for k, v in rule['attributes'].items():
                        odt_node.setAttribute(k, v)

                    # copy original href attribute in <a> tag
                    if tag == 'a':
                        if html_node.hasAttribute('href'):
                            odt_node.setAttribute(
                                'xlink:href', html_node.getAttribute('href'))

                # Does the node need to create an style?
                if 'style' in rule:
                    name = rule['style']['name']
                    if name not in styles_cache:
                        style_node = self.get_style_by_name(name)

                        if style_node is None:
                            # Create the style node
                            style_node = self.insert_style_in_content(
                                name, rule['style'].get('attributes', None),
                                **rule['style']['properties'])

                        styles_cache[name] = style_node

                html_node.parentNode.replaceChild(odt_node, html_node)

        def node_to_string(node):
            result = node.toxml()

            # linebreaks in preformatted nodes should be converted to
            # <text:line-break/>
            if (node.nodeType == Node.ELEMENT_NODE) and \
               (node.getAttribute('text:style-name') == 'Preformatted_20_Text'):
                result = result.replace('\n', '<text:line-break/>')

            # All double linebreak should be replaced with an empty paragraph
            return result.replace('\n\n', '<text:p text:style-name="Standard"/>')

        # NOTE(review): the default environment is created with
        # autoescape=True and this returns a plain string, so the markup
        # may be escaped on output — confirm.
        html_root = xml_object.getElementsByTagName('html')[0]
        return ''.join(map(node_to_string, html_root.childNodes))
def render_template(template, **kwargs):
    """
    Render a ODF template file

    args:
        template: A template file. Could be a path or a file instance.
        **kwargs: Template variables.

    returns:
        The rendered document, as returned by Renderer.render().
    """
    # The template belongs to render(); Renderer's only positional
    # argument is an optional jinja2 environment.
    engine = Renderer()
    return engine.render(template, **kwargs)
if __name__ == "__main__":
    # Demo: render simple_template.odt with sample data into rendered.odt.
    import os
    from datetime import datetime

    def read(fname):
        # Return the text of a file that lives next to this module.
        path = os.path.join(os.path.dirname(__file__), fname)
        with io.open(path, encoding='utf-8') as source:
            return source.read()

    document = {
        'datetime': datetime.now(),
        'md_sample': read('README.md')
    }

    countries = [
        {'country': 'United States', 'capital': 'Washington', 'cities': ['miami', 'new york', 'california', 'texas', 'atlanta']},
        {'country': 'England', 'capital': 'London', 'cities': ['gales']},
        {'country': 'Japan', 'capital': 'Tokio', 'cities': ['hiroshima', 'nagazaki']},
        {'country': 'Nicaragua', 'capital': 'Managua', 'cities': ['leon', 'granada', 'masaya']},
        {'country': 'Argentina', 'capital': 'Buenos aires'},
        {'country': 'Chile', 'capital': 'Santiago'},
        {'country': 'Mexico', 'capital': 'MExico City', 'cities': ['puebla', 'cancun']},
    ]

    engine = Renderer()
    result = engine.render('simple_template.odt', countries=countries, document=document)

    # Context manager so the output is flushed and closed even on error.
    with open('rendered.odt', 'wb') as output:
        output.write(result)

    print("Template rendering finished! Check rendered.odt file.")