# (C) 2008  DFKI Language Technology Lab http://www.dfki.de/lt
#     Project HyLaP;  Author: Torsten Marek
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA

from __future__ import division
import os
import subprocess
import tempfile

from pytake.etree_utils import write_tree

from pdfboxparser import BadExtractionError, PdfBoxOutputParser

class TextExtractor(object):
    """Run the external text-extraction tool on a PDF file.

    The tool located at ``PATH`` is invoked as
    ``PATH -encoding UTF-8 <pdf_file> <output_file>``.  With ``keep`` set,
    the output is stored in ``cache_dir`` as ``<pdf basename>.raw`` and
    reused on later calls; otherwise it goes to a scratch file that
    ``cleanup()`` removes.
    """

    # Path of the extractor executable; a relative path is resolved
    # against the current working directory.
    PATH = "TextExtractor/textextract"

    def __init__(self, keep = False, cache_dir = ""):
        """Store the caching options.

        keep      -- if true, write the output into ``cache_dir`` and never
                     delete it.
        cache_dir -- directory for kept output files (only used with
                     ``keep``).
        """
        self._keep = keep
        self._cache_dir = cache_dir
        # Private scratch directory, created on demand by extract().
        self._tmp_dir = None
        # Path of the most recent output file; None until extract() ran.
        self.tmpfile_name = None

    def extract(self, pdf_file):
        """Extract ``pdf_file`` and return the path of the output file.

        The tool is only run when the output file does not exist yet, so a
        kept file from an earlier run is reused as-is.  The exit status of
        the tool is not inspected; if it fails, the returned path may not
        exist.

        Raises OSError if the extractor executable cannot be started.
        """
        raw_name = "%s.raw" % (os.path.splitext(os.path.basename(pdf_file))[0],)
        if self._keep:
            self.tmpfile_name = os.path.join(self._cache_dir, raw_name)
        else:
            # A freshly created private directory gives an output path
            # nobody else can predict or pre-create (unlike os.tempnam,
            # which is race-prone and gone in Python 3).
            self._tmp_dir = tempfile.mkdtemp()
            self.tmpfile_name = os.path.join(self._tmp_dir, raw_name)

        if not os.path.exists(self.tmpfile_name):
            # Argument list instead of a shell string: file names with
            # spaces or shell metacharacters are passed through verbatim.
            subprocess.call([TextExtractor.PATH, "-encoding", "UTF-8",
                             pdf_file, self.tmpfile_name])
        return self.tmpfile_name

    def cleanup(self):
        """Delete the scratch output unless ``keep`` was requested.

        Safe to call when extract() never ran or failed early.
        """
        if self._keep:
            return

        tmpfile_name = getattr(self, "tmpfile_name", None)
        if tmpfile_name is not None and os.path.exists(tmpfile_name):
            os.unlink(tmpfile_name)

        tmp_dir = getattr(self, "_tmp_dir", None)
        if tmp_dir is not None:
            try:
                os.rmdir(tmp_dir)
            except OSError:
                # Best effort: the directory is already gone or the tool
                # left additional files behind; do not mask other errors.
                pass
            self._tmp_dir = None


def convert_to_article(pdf_file, output_file, bibtex_file = None,
                       keep = False, cache_dir = "", wordlist = None):
    """Convert ``pdf_file`` into an article and write it to ``output_file``.

    The PDF is run through a TextExtractor, the extractor's output file is
    parsed with a PdfBoxOutputParser, and the XML of the resulting article
    is written with write_tree.

    pdf_file    -- path of the PDF to convert.
    output_file -- destination handed to write_tree.
    bibtex_file -- passed on to the parser's to_article(); defaults to None.
    keep        -- forwarded to TextExtractor (keep the raw extraction).
    cache_dir   -- forwarded to TextExtractor (where kept output lives).
    wordlist    -- passed to the PdfBoxOutputParser constructor.
    """
    text_source = TextExtractor(keep = keep, cache_dir = cache_dir)
    parser = PdfBoxOutputParser(wordlist)
    try:
        raw_path = text_source.extract(pdf_file)
        parser.parse(raw_path)
    finally:
        # The raw extraction is only needed while parsing.
        text_source.cleanup()

    article = parser.to_article(bibtex_file)
    write_tree(article.to_xml(), output_file)