# (C) 2008 DFKI Language Technology Lab http://www.dfki.de/lt # Project HyLaP; Author: Torsten Marek # # This program is free software; you can redistribute it and/or # modify it under the terms of the GNU Lesser General Public # License 2.1 as published by the Free Software Foundation. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU # Lesser General Public License for more details. # # You should have received a copy of the GNU Lesser General Public # License along with this library; if not, write to the Free Software # Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 # USA from __future__ import division import os from pytake.etree_utils import write_tree from pdfboxparser import BadExtractionError, PdfBoxOutputParser class TextExtractor(object): PATH = "TextExtractor/textextract" def __init__(self, keep = False, cache_dir = ""): self._keep = keep self._cache_dir = cache_dir def extract(self, pdf_file): if self._keep: self.tmpfile_name = os.path.join(self._cache_dir, "%s.raw" % (os.path.splitext(os.path.split(pdf_file)[1])[0],)) else: self.tmpfile_name = os.tempnam() if not os.path.exists(self.tmpfile_name): os.system("%s -encoding UTF-8 %s %s" % (TextExtractor.PATH, pdf_file, self.tmpfile_name)) return self.tmpfile_name def cleanup(self): if os.path.exists(self.tmpfile_name) and not self._keep: os.unlink(self.tmpfile_name) def convert_to_article(pdf_file, output_file, bibtex_file = None, keep = False, cache_dir = "", wordlist = None): extractor = TextExtractor(keep = keep, cache_dir = cache_dir) pdf_parser = PdfBoxOutputParser(wordlist) try: pdf_parser.parse(extractor.extract(pdf_file)) finally: extractor.cleanup() write_tree(pdf_parser.to_article(bibtex_file).to_xml(), output_file)