# -*- coding: utf-8 -*-
"""
The document module handles an Open XML document
"""
# $Id$
import fnmatch
import imghdr
import os
import shutil
import tempfile
import urllib.error
import urllib.parse
import urllib.request
import urllib.response
import zipfile

import lxml
import lxml.etree

from . import contenttypes
from .namespaces import ns_map
from .utils import toUnicode
from .utils import xmlFile
class Document(object):
    """**Base class for handling Open XML document (all types)**

    **Must** be subclassed for various types of documents (word processing, ...)

    :param file_: An opened file(like) object of the document that must be
       opened in 'rb' mode
    :param mime_type: the MIME type for the file, potentially found by
       :func:`openxmllib.openXmlDocument`
    """

    #: A mapping like ``{glob-expr: mime-type, ...}`` must be overridden by subclasses
    _extpattern_to_mime = {}

    #: A sequence of extractor objects for text extraction must be overridden by subclasses
    _text_extractors = []

    def __init__(self, file_, mime_type=None):
        """**Creating a new document**

        The archive is inflated into a private temporary directory which is
        removed when the object is deleted. ``file_`` is closed once its
        content has been extracted.

        :raises ValueError: if ``file_`` has no ``name`` attribute and no
           ``mime_type`` is provided.
        """
        #: The MIME type of the document
        self.mime_type = mime_type

        # Preliminary settings depending on input
        #: The file name of the document
        self.filename = getattr(file_, 'name', None)
        if self.filename is None and mime_type is None:
            raise ValueError("Cannot guess mime type from such object, you should use the mime_type constructor arg.")

        # zipfile needs random access, so objects returned by
        # urllib.request.urlopen (and any stream that explicitly reports
        # itself as not seekable) are first spooled to a real file.
        # Objects without a ``seekable`` method are passed through untouched.
        seekable = getattr(file_, 'seekable', None)
        owns_handle = False
        if (isinstance(file_, urllib.response.addinfourl)
                or (callable(seekable) and not seekable())):
            fd, self._cache_file = tempfile.mkstemp()
            with os.fdopen(fd, 'wb') as spool:
                shutil.copyfileobj(file_, spool)
            file_.close()
            file_ = open(self._cache_file, 'rb')
            owns_handle = True

        # Inflating the zipped file.
        # ZipFile.extractall converts member names to native paths, creates
        # directories (including explicit directory entries) and strips
        # absolute / ".." components so nothing is written outside the cache.
        self._cache_dir = tempfile.mkdtemp()
        try:
            with zipfile.ZipFile(file_, 'r') as archive:
                archive.extractall(self._cache_dir)
        except Exception:
            # Only release the handle we opened ourselves; a caller-provided
            # file is left as it was when extraction fails.
            if owns_handle:
                file_.close()
            raise
        file_.close()

        # Getting the content types declarations
        ct_file = os.path.join(self._cache_dir, '[Content_Types].xml')
        #: A :class:`openxmllib.contenttypes.ContentTypes` object for this document
        self.content_types = contenttypes.ContentTypes(xmlFile(ct_file, 'rb'))

    @property
    def mimeType(self):
        """The official MIME type for this document, guessed from the extensions
        of the :py:attr:`openxmllib.document.Document.filename` attribute, as
        opposed to the :py:attr:`openxmllib.document.Document.mime_type`
        attribute.

        :return: ``application/xxx`` for this file, or ``None`` when no
           registered pattern matches the file name
        """
        if self.mime_type:
            # Supposed validated by the factory
            return self.mime_type
        for pattern, mime_type in self._extpattern_to_mime.items():
            if fnmatch.fnmatch(self.filename, pattern):
                return mime_type
        return None

    @property
    def coreProperties(self):
        """Document core properties (author, ...) similar to DublinCore

        :return: mapping of standard metadata like ``{'title': 'blah', 'language': 'fr-FR', ...}``
        """
        return self._tagValuedProperties(contenttypes.CT_CORE_PROPS)

    @property
    def extendedProperties(self):
        """Additional document automatic properties provided by the office app

        :return: mapping of metadata like ``{'Pages': '14', ...}``
        """
        return self._tagValuedProperties(contenttypes.CT_EXT_PROPS)

    def _tagValuedProperties(self, content_type):
        """Document properties for property files having constructs like
        <ns:name>value</ns:name>

        :param content_type: ``contenttypes.CT_CORE_PROPS`` or ``contenttypes.CT_EXT_PROPS``
        :return: mapping like {'property name': 'property value', ...}
        """
        rval = {}
        if content_type not in self.content_types.listMetaContentTypes:
            # We fail silently
            return rval
        for tree in self.content_types.getTreesFor(self, content_type):
            # Iterating an element yields its direct children; this replaces
            # the deprecated lxml ``getchildren()`` call.
            for elt in tree.getroot():
                tag = elt.tag.split('}')[-1]  # Removing namespace if any
                rval[toUnicode(tag)] = toUnicode(elt.text)
        return rval

    @property
    def customProperties(self):
        """Document custom properties added by the document author.

        We cannot convert the properties as indicated
        with the http://schemas.openxmlformats.org/officeDocument/2006/docPropsVTypes
        namespace

        :return: mapping of metadata
        """
        rval = {}
        if not self.content_types.getPathsForContentType(contenttypes.CT_CUSTOM_PROPS):
            # We may have no custom properties at all.
            return rval
        XPath = lxml.etree.XPath  # Class shortcut
        properties_xpath = XPath('custom-properties:property', namespaces=ns_map)
        propname_xpath = XPath('@name')
        propvalue_xpath = XPath('*/text()')
        for tree in self.content_types.getTreesFor(self, contenttypes.CT_CUSTOM_PROPS):
            for elt in properties_xpath(tree.getroot()):
                # A property value is the space-joined text of its child nodes
                rval[toUnicode(propname_xpath(elt)[0])] = " ".join(propvalue_xpath(elt))
        return rval

    @property
    def allProperties(self):
        """Helper that merges core, extended and custom properties

        On name clashes, custom properties win over extended ones, which win
        over core ones.

        :return: mapping of all properties
        """
        rval = {}
        rval.update(self.coreProperties)
        rval.update(self.extendedProperties)
        rval.update(self.customProperties)
        return rval

    def documentCover(self):
        """Cover page image

        :return: (file extension, file object) tuple, or ``None`` when the
           package relationships declare no thumbnail. The file object is
           opened in 'rb' mode, positioned at its start, and must be closed
           by the caller.
        """
        rels_pth = os.path.join(self._cache_dir, "_rels", ".rels")
        rels_xml = lxml.etree.parse(xmlFile(rels_pth, 'rb'))
        thumb_elm_xpr = "relationships:Relationship[@Type='%s']" % ns_map["thumbnails"]
        rels_xpath = lxml.etree.XPath(thumb_elm_xpr, namespaces=ns_map)
        thumbnails = rels_xpath(rels_xml)
        if not thumbnails:
            return None
        cover_path = thumbnails[0].attrib["Target"]

        cover_fp = open(self._cache_dir + os.sep + cover_path, "rb")
        try:
            # NOTE(review): imghdr is deprecated since Python 3.11 and removed
            # in 3.13; a replacement sniffer will be needed there.
            cover_type = imghdr.what(None, h=cover_fp.read(32))
            cover_fp.seek(0)
        except Exception:
            # Do not leak the handle if sniffing fails
            cover_fp.close()
            raise

        # some MS docs say the type can be JPEG which is ok,
        # or WMF, which imghdr does not recognize...
        if not cover_type:
            cover_type = cover_path.split('.')[-1]
        else:
            cover_type = cover_type.replace("jpeg", "jpg")
        return (cover_type, cover_fp)

    def indexableText(self, include_properties=True):
        """Words found in the various texts of the document.

        :param include_properties: Adds words from properties
        :return: Space separated words of the document (each distinct word
           appears once, in no particular order).
        """
        text = set()
        for extractor in self._text_extractors:
            if extractor.content_type in self.content_types.overrides:
                for tree in self.content_types.getTreesFor(self, extractor.content_type):
                    text.update(extractor.indexableText(tree))
        if include_properties:
            for prop_value in self.allProperties.values():
                # Empty property elements yield None; skip them
                if prop_value is not None:
                    text.add(prop_value)
        return ' '.join(text)

    def __del__(self):
        """Cleanup at Document object deletion

        Removes the extraction directory and the spool file, if any. Both
        removals are best effort: a destructor must not raise.
        """
        # The attributes are missing when __init__ failed early
        cache_dir = getattr(self, '_cache_dir', None)
        if cache_dir is not None:
            shutil.rmtree(cache_dir, ignore_errors=True)
        cache_file = getattr(self, '_cache_file', None)
        if cache_file is not None:
            try:
                os.remove(cache_file)
            except OSError:
                # Already removed or still locked: nothing more we can do here
                pass

    @classmethod
    def canProcessMime(cls, mime_type):
        """Check if we can process such mime type

        :param mime_type: Mime type as 'application/xxx'
        :return: True if we can process such mime
        """
        return mime_type in cls._extpattern_to_mime.values()

    @classmethod
    def canProcessFilename(cls, filename):
        """Check if we can process such file based on name

        :param filename: File name as 'mydoc.docx'
        :return: True if we can process such file
        """
        return any(fnmatch.fnmatch(filename, pattern)
                   for pattern in cls._extpattern_to_mime)