#!/usr/bin/python # * Copyright (c) 2012-2014 Christopher Ramirez chris.ramirezg@gmail.com # * # * Licensed under the MIT license. """ Secretary This project is a document engine which make use of LibreOffice documents as templates and use the semantics of jinja2 to control variable printing and control flow. To render a template: engine = Renderer(template_file) result = engine.render(template_var1=...) """ from __future__ import unicode_literals, print_function import io import re import sys import logging import zipfile from xml.dom.minidom import parseString from jinja2 import Environment, Undefined FLOW_REFERENCES = { 'text:p' : 'text:p', 'paragraph' : 'text:p', 'before::paragraph' : 'text:p', 'after::paragraph' : 'text:p', 'table:table-row' : 'table:table-row', 'table-row' : 'table:table-row', 'row' : 'table:table-row', 'before::table-row' : 'table:table-row', 'after::table-row' : 'table:table-row', 'before::row' : 'table:table-row', 'after::row' : 'table:table-row', 'table:table-cell' : 'table:table-cell', 'table-cell' : 'table:table-cell', 'cell' : 'table:table-cell', 'before::table-cell' : 'table:table-cell', 'after::table-cell' : 'table:table-cell', 'before::cell' : 'table:table-cell', 'after::cell' : 'table:table-cell', } SUPPORTED_FIELD_REFERECES = ['text:p', 'table:table-row', 'table:table-cell'] # ---- Exceptions class SecretaryError(Exception): pass class UndefinedSilently(Undefined): # Silently undefined, # see http://stackoverflow.com/questions/6182498/jinja2-how-to-make-it-fail-silently-like-djangotemplate def silently_undefined(*args, **kwargs): return '' return_new = lambda *args, **kwargs: UndefinedSilently() __unicode__ = silently_undefined __str__ = silently_undefined __call__ = return_new __getattr__ = return_new # ************************************************ # # SECRETARY FILTERS # # ************************************************ def pad_string(value, length=5): value = str(value) return value.zfill(length) class Renderer(object): """ Main engine to convert and ODT document into a jinja compatible template. Basic use example: engine = Renderer('template') result = engine.render() Renderer provides an enviroment variable which can be used to provide custom filters to the ODF render. engine = Renderer('template.odt') engine.environment.filters['custom_filer'] = filter_function result = engine.render() """ def __init__(self, template, **kwargs): """ Builds a Renderer instance and assign init the internal enviroment. Params: template: Either the path to the file, or a file-like object. If it is a path, the file will be open with mode read 'r'. """ self.log = logging.getLogger(__name__) self.log.debug('Initing a Renderer instance\nTemplate: %s', template) self.template = template self.environment = Environment(undefined=UndefinedSilently, autoescape=True) # Register provided filters self.environment.filters['pad'] = pad_string self.environment.filters['markdown'] = self.markdown_filter self.file_list = {} def unpack_template(self): """ Loads the template into a ZIP file, allowing to make CRUD operations into the ZIP archive. """ self.log.debug('Unpacking template file') with zipfile.ZipFile(self.template, 'r') as unpacked_template: # go through the files in source for zi in unpacked_template.filelist: file_contents = unpacked_template.read( zi.filename ) self.file_list[zi.filename] = file_contents self.log.debug('File "%s" unpacked', zi.filename) if zi.filename == 'content.xml': self.log.debug('Parsing content.xml\n%s', file_contents) self.content = parseString( file_contents ) elif zi.filename == 'styles.xml': self.log.debug('Parsing styles.xml\n%s', file_contents) self.styles = parseString( file_contents ) def pack_document(self): """ Make an archive from _unpacked_template """ # Save rendered content and headers self.rendered = io.BytesIO() self.log.debug('Packing document...') with zipfile.ZipFile(self.rendered, 'a') as packed_template: for filename, content in self.file_list.items(): if filename in ['content.xml', 'styles.xml']: self.log.debug( 'Trying to pack "%s" into archive and encoding it into ascii\n%s', filename, self.styles.toxml()) content = self.styles.toxml().encode('ascii', 'xmlcharrefreplace') if sys.version_info >= (2, 7): packed_template.writestr(filename, content, zipfile.ZIP_DEFLATED) self.log.debug('File "%s" packed into archive with ZIP_DEFLATED', filename) else: packed_template.writestr(filename, content) self.log.debug('File "%s" packed into archive', filename) def render(self, **kwargs): """ Unpack and render the internal template and returns the rendered ODF document. """ self.log.debug('render called with\n%s', kwargs) def unescape_gt_lt(text): # unescape XML entities gt and lt unescape_entities = { r'({[{|%].*)(>)(.*[%|}]})': r'\1>\3', r'({[{|%].*)(<)(.*[%|}]})': r'\1<\3', } for pattern, repl in unescape_entities.iteritems(): text = re.sub(pattern, repl, text, flags=re.IGNORECASE or re.DOTALL) self.log.debug('GT and LT tags successfully unescaped\n%s', text) return text self.unpack_template() # Render content.xml self.log.debug('Trying to render content.xml with jinja') self.prepare_template_tags(self.content) template = self.environment.from_string(unescape_gt_lt(self.content.toxml())) result = template.render(**kwargs) self.log.debug('Jinja2 successfully parsed content.xml') result = result.replace('\n', '') self.log.debug('Line breaks replaced successfully') # Replace original body with rendered body original_body = self.content.getElementsByTagName('office:body')[0] rendered_body = parseString(result.encode('ascii', 'xmlcharrefreplace')).getElementsByTagName('office:body')[0] self.log.debug( 'Replacing original document body with rendered version\n%s', result) document = self.content.getElementsByTagName('office:document-content')[0] document.replaceChild(rendered_body, original_body) self.log.debug('Original body replaced with the rendered version') # Render styles.xml self.log.debug('Trying to render styles.xml with jinja') self.prepare_template_tags(self.styles) template = self.environment.from_string(unescape_gt_lt(self.styles.toxml())) result = template.render(**kwargs) self.log.debug('Jinja2 successfully parsed styles.xml') result = result.replace('\n', '') self.log.debug('Lines break successfully encoded to .') self.log.debug('Now replacing template styles.xml with the rendered version') self.styles = parseString(result.encode('ascii', 'xmlcharrefreplace')) self.log.debug('New styles.xml file successfully parsed') self.pack_document() return self.rendered.getvalue() def node_parents(self, node, parent_type): """ Returns the first node's parent with name of parent_type If parent "text:p" is not found, returns None. """ if hasattr(node, 'parentNode'): if node.parentNode.nodeName.lower() == parent_type: return node.parentNode else: return self.node_parents(node.parentNode, parent_type) else: return None def create_text_span_node(self, xml_document, content): span = xml_document.createElement('text:span') text_node = self.create_text_node(xml_document, content) span.appendChild(text_node) return span def create_text_node(self, xml_document, text): """ Creates a text node """ return xml_document.createTextNode(text) def inc_node_fields_count(self, node, field_type='variable'): """ Increase field count of node and its parents """ if node is None: return if not hasattr(node, 'secretary_field_count'): setattr(node, 'secretary_field_count', 0) if not hasattr(node, 'secretary_variable_count'): setattr(node, 'secretary_variable_count', 0) if not hasattr(node, 'secretary_block_count'): setattr(node, 'secretary_block_count', 0) node.secretary_field_count += 1 if field_type == 'variable': node.secretary_variable_count += 1 else: node.secretary_block_count += 1 self.inc_node_fields_count(node.parentNode, field_type) def prepare_template_tags(self, xml_document): """ Search every field node in the inner template and replace them with a field. Flow tags are replaced with a blank node and moved into the ancestor tag defined in description field attribute. """ self.log.debug('Preparing template tags\n%s', xml_document.toxml()) fields = xml_document.getElementsByTagName('text:text-input') # First, count secretary fields for field in fields: if not field.hasChildNodes(): continue field_content = field.childNodes[0].data.replace('\n', '') if not re.findall(r'(\{.*?\}*})', field_content): # Field does not contains jinja template tags continue is_block_tag = re.findall(r'(^\{\%.*?\%\}*})$', field_content.strip()) self.inc_node_fields_count(field.parentNode, 'variable' if not is_block_tag else 'block') for field in fields: if field.hasChildNodes(): field_content = field.childNodes[0].data.replace('\n', '') if not re.findall(r'(\{.*?\}*})', field_content): # Field does not contains jinja template tags continue is_block_tag = re.findall(r'(^\{\%.*?\%\}*})$', field_content.strip()) keep_field = field field_reference = field.getAttribute('text:description') if re.findall(r'\|markdown', field_content): # a markdown field should take the whole paragraph field_reference = 'text:p' if not field_reference: if is_block_tag: # Find the node where this control flow field we # consider will be really needed. while field.parentNode.secretary_field_count <= 1: field = field.parentNode if field is not None: jinja_tag_node = self.create_text_node(xml_document, field_content) else: jinja_tag_node = self.create_text_span_node(xml_document, field_content) else: odt_reference = FLOW_REFERENCES.get(field_reference.strip(), field_reference) if odt_reference in SUPPORTED_FIELD_REFERECES: field = self.node_parents(field, odt_reference) jinja_tag_node = self.create_text_node(xml_document, field_content) parent = field.parentNode if not field_reference.startswith('after::'): parent.insertBefore(jinja_tag_node, field) else: if field.isSameNode(parent.lastChild): parent.appendChild(jinja_tag_node) else: parent.insertBefore(jinja_tag_node, field.nextSibling) if field_reference.startswith('after::') or field_reference.startswith('before::'): # Avoid removing whole container, just original text:p parent field = self.node_parents(keep_field, 'text:p') parent = field.parentNode parent.removeChild(field) def get_style_by_name(self, style_name): """ Search in for style_name. Return None if style_name is not found. Otherwise return the style node """ auto_styles = self.content.getElementsByTagName('office:automatic-styles')[0] if not auto_styles.hasChildNodes(): return None for style_node in auto_styles.childNodes: if style_node.hasAttribute('style:name') and \ (style_node.getAttribute('style:name') == style_name): return style_node return None def insert_style_in_content(self, style_name, attributes=None, **style_properties): """ Insert a new style into content.xml's node. Returns a reference to the newly created node """ auto_styles = self.content.getElementsByTagName('office:automatic-styles')[0] style_node = self.content.createElement('style:style') style_node.setAttribute('style:name', style_name) style_node.setAttribute('style:family', 'text') style_node.setAttribute('style:parent-style-name', 'Standard') if attributes: for k, v in attributes.iteritems(): style_node.setAttribute('style:%s' % k, v) if style_properties: style_prop = self.content.createElement('style:text-properties') for k, v in style_properties.iteritems(): style_prop.setAttribute('%s' % k, v) style_node.appendChild(style_prop) return auto_styles.appendChild(style_node) def markdown_filter(self, markdown_text): """ Convert a markdown text into a ODT formated text """ if not isinstance(markdown_text, basestring): return '' from xml.dom import Node from markdown_map import transform_map try: from markdown2 import markdown except ImportError: raise SecretaryError('Could not import markdown2 library. Install it using "pip install markdown2"') styles_cache = {} # cache styles searching html_text = markdown(markdown_text) xml_object = parseString('%s' % html_text.encode('ascii', 'xmlcharrefreplace')) # Transform HTML tags as specified in transform_map # Some tags may require extra attributes in ODT. # Additional attributes are indicated in the 'attributes' property for tag in transform_map: html_nodes = xml_object.getElementsByTagName(tag) for html_node in html_nodes: odt_node = xml_object.createElement(transform_map[tag]['replace_with']) # Transfer child nodes if html_node.hasChildNodes(): for child_node in html_node.childNodes: odt_node.appendChild(child_node.cloneNode(True)) # Add style-attributes defined in transform_map if 'style_attributes' in transform_map[tag]: for k, v in transform_map[tag]['style_attributes'].iteritems(): odt_node.setAttribute('text:%s' % k, v) # Add defined attributes if 'attributes' in transform_map[tag]: for k, v in transform_map[tag]['attributes'].iteritems(): odt_node.setAttribute(k, v) # copy original href attribute in tag if tag == 'a': if html_node.hasAttribute('href'): odt_node.setAttribute('xlink:href', html_node.getAttribute('href')) # Does the node need to create an style? if 'style' in transform_map[tag]: name = transform_map[tag]['style']['name'] if not name in styles_cache: style_node = self.get_style_by_name(name) if style_node is None: # Create and cache the style node style_node = self.insert_style_in_content( name, transform_map[tag]['style'].get('attributes', None), **transform_map[tag]['style']['properties']) styles_cache[name] = style_node html_node.parentNode.replaceChild(odt_node, html_node) def node_to_string(node): result = node.toxml() # linebreaks in preformated nodes should be converted to if (node.__class__.__name__ != 'Text') and \ (node.getAttribute('text:style-name') == 'Preformatted_20_Text'): result = result.replace('\n', '') # All double linebreak should be replaced with an empty paragraph return result.replace('\n\n', '') return ''.join(node_as_str for node_as_str in map(node_to_string, xml_object.getElementsByTagName('html')[0].childNodes)) def render_template(template, **kwargs): """ Render a ODF template file """ engine = Renderer(file) return engine.render(**kwargs) if __name__ == "__main__": import os from datetime import datetime def read(fname): return open(os.path.join(os.path.dirname(__file__), fname)).read() document = { 'datetime': datetime.now(), 'md_sample': read('README.md') } countries = [ {'country': 'United States', 'capital': 'Washington', 'cities': ['miami', 'new york', 'california', 'texas', 'atlanta']}, {'country': 'England', 'capital': 'London', 'cities': ['gales']}, {'country': 'Japan', 'capital': 'Tokio', 'cities': ['hiroshima', 'nagazaki']}, {'country': 'Nicaragua', 'capital': 'Managua', 'cities': ['leon', 'granada', 'masaya']}, {'country': 'Argentina', 'capital': 'Buenos aires'}, {'country': 'Chile', 'capital': 'Santiago'}, {'country': 'Mexico', 'capital': 'MExico City', 'cities': ['puebla', 'cancun']}, ] render = Renderer('simple_template.odt') result = render.render(countries=countries, document=document) output = open('rendered.odt', 'wb') output.write(result) print("Template rendering finished! Check rendered.odt file.")