# -*- coding: utf-8 -*-

# (C) 2008  DFKI Language Technology Lab http://www.dfki.de/lt
#     Project HyLaP;  Author: Torsten Marek
# 
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
# 
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA


from __future__ import division

import os
import sgmllib
import re

from pytake.compat import *
from pytake.utils import filter_chain, join, most_frequent_item
from pytake.text_utils import *
from pytake.dom import Article
from pytake.etree_utils import IterParseHandler, element_handler

# Text encodings. Neither constant is referenced in this part of the file;
# presumably they are used by callers when decoding input and encoding
# output -- TODO confirm.
INPUT_ENCODING = "UTF-8"
OUTPUT_ENCODING = "UTF-8"

# Replacement table used by convert_chars(): typographic characters are
# mapped to plain ASCII look-alikes.  Keys are written as escapes so that
# invisible characters (soft hyphen, no-break space) cannot be silently
# altered by an editor and the table does not depend on the source encoding.
CHAR_CONVERSIONS = {
    u"\ufb01": "fi",    # LATIN SMALL LIGATURE FI
    u"\ufb00": "ff",    # LATIN SMALL LIGATURE FF
    u"\ufb03": "ffi",   # LATIN SMALL LIGATURE FFI
    u"\ufb02": "fl",    # LATIN SMALL LIGATURE FL
    # all dashes are mapped to HYPHEN-MINUS
    u"\u2013": "-",     # EN DASH
    u"\u2014": "-",     # EM DASH
    u"\u2212": "-",     # MINUS SIGN
    u"\u2019": "'",     # RIGHT SINGLE QUOTATION MARK
    u"\u2018": "'",     # LEFT SINGLE QUOTATION MARK
    u"\u201d": '"',     # RIGHT DOUBLE QUOTATION MARK
    u"\u201c": '"',     # LEFT DOUBLE QUOTATION MARK
    u"\u2026": "...",   # HORIZONTAL ELLIPSIS
    u"\xad": "",        # SOFT HYPHEN (removed)
    u"\xa0": " ",       # NO-BREAK SPACE
    }


def convert_chars(s):
    """Return ``s`` with every character listed in CHAR_CONVERSIONS replaced.

    Characters that are not in the table are copied unchanged.  A single
    input character may expand to several output characters (ligatures,
    ellipsis) or vanish entirely (soft hyphen).
    """
    lookup = CHAR_CONVERSIONS.get
    return "".join(lookup(c, c) for c in s)

class BadExtractionError(Exception):
    """Signals that the extracted text is unusable.

    Within this module it is raised when the dominant font size turns out
    to be negative and when an article ends up without any main text.
    """


class Page(object):
    """A single page: its index and the text boxes found on it.

    ``width`` and ``height`` are 0 until compute_extents() has been called
    on a page that holds at least one box.
    """

    def __init__(self, idx):
        self.idx = idx
        self.boxes = []
        # Defined up front so that reading the extents before
        # compute_extents() ran does not raise AttributeError.
        self.width = 0
        self.height = 0

    def compute_extents(self):
        """Set ``width``/``height`` to the largest lower-right box corner.

        A page without boxes gets extents of 0 instead of letting ``max()``
        fail on an empty sequence.
        """
        if not self.boxes:
            self.width = 0
            self.height = 0
            return

        self.width = max([b.lowright[0] for b in self.boxes])
        self.height = max([b.lowright[1] for b in self.boxes])
        
                 
# Two boxes on the same line are merged when the horizontal gap between
# them is smaller than this value (same unit as the box coordinates).
MAX_HORIZ_MERGE_DISTANCE = 15
# Box widths are rounded down to a multiple of this value.
BOX_GRANULARITY = 30


class TextBox(object):
    """A rectangular piece of text on a page.

    ``upleft`` and ``lowright`` are ``(x, y)`` pairs, ``page`` is the owning
    page object, ``text`` the collected text.  ``type`` and ``fontsize`` are
    assigned by whoever fills the box and are ``None`` until then.
    """
    __slots__ = ("text", "upleft", "lowright", "type", "page", "fontsize")

    def __init__(self, upleft, lowright, page):
        self.page = page
        self.text = u""
        self.upleft = upleft
        self.lowright = lowright
        # Initialise the remaining slots: an unset slot raises
        # AttributeError on read, which made fresh boxes and copies fragile.
        self.type = None
        self.fontsize = None

    def get_normalized_text(self, wordlist):
        """Return the text de-hyphenated line by line and character-converted.

        The lines of the box are handed to ``unhyphenize`` together with
        ``wordlist``; its result is passed through ``convert_chars``.
        """
        return convert_chars(unhyphenize(self.text.split("\n"), wordlist))

    def copy(self):
        """Return a new box carrying over every attribute of this one."""
        c = TextBox(self.upleft, self.lowright, self.page)
        c.text = self.text
        # Previously dropped, leaving the copy without type and font size.
        c.type = self.type
        c.fontsize = self.fontsize
        return c

    def get_rounded_width(self):
        """Return the box width rounded down to a multiple of BOX_GRANULARITY."""
        width = self.lowright[0] - self.upleft[0]
        return int(width // BOX_GRANULARITY * BOX_GRANULARITY)

    @staticmethod
    def merge(box_a, box_b, connector= " "):
        """Combine two boxes into a new one.

        The result spans from the upper-left corner of ``box_a`` to the
        lower-right corner of ``box_b`` and inherits page, type and font
        size from ``box_a``.  The texts are joined with ``connector`` after
        trimming the whitespace on the sides that touch.
        """
        b = TextBox(box_a.upleft, box_b.lowright, box_a.page)
        b.text = connector.join([box_a.text.rstrip(), box_b.text.lstrip()])
        b.type = box_a.type
        b.fontsize = box_a.fontsize
        return b

    @staticmethod
    def adjacent(box_a, box_b):
        """Tell whether ``box_b`` continues ``box_a`` on the same line.

        True when both boxes share the same top coordinate and font size
        and ``box_b`` starts less than MAX_HORIZ_MERGE_DISTANCE to the
        right of the end of ``box_a``.
        """
        # NOTE(review): the gap is only bounded from above, so a box_b that
        # starts far to the left of box_a also qualifies -- confirm that
        # this is intended.
        gap = box_b.upleft[0] - box_a.lowright[0]
        return gap < MAX_HORIZ_MERGE_DISTANCE \
               and box_a.upleft[1] == box_b.upleft[1] \
               and box_a.fontsize == box_b.fontsize

    @staticmethod
    def same_paragraph(box_a, box_b):
        """Tell whether ``box_b`` continues the paragraph begun in ``box_a``.

        Requires equal font sizes and a text in ``box_a`` that
        ``is_complete_paragraph`` rejects.  Then ``box_b`` must either sit
        on the directly following page, or on the same page in the right
        half while ``box_a`` starts in the left half.
        """
        if box_a.fontsize != box_b.fontsize:
            return False

        if is_complete_paragraph(box_a.text):
            return False

        # paragraph broken by a page break
        if box_a.page.idx == box_b.page.idx - 1:
            return True

        # paragraph broken by a column break
        middle = box_a.page.width / 2
        return (box_a.page.idx == box_b.page.idx
                and box_a.upleft[0] < middle
                and box_b.upleft[0] > middle)
    

class _test_method_collector(type):
    """Metaclass that gathers the ``test_*`` callables of a class body.

    The functions are stored (unbound) as a list in the class attribute
    ``FILTERS``.  Only attributes defined in the class body itself are
    considered; the order of the list is unspecified.
    """

    def __new__(cls, name, bases, dict):
        # The loop variable must not be called ``name``: list-comprehension
        # variables leak into the enclosing scope in Python 2, which used to
        # replace the class name handed to type.__new__ below.
        # items() (rather than iteritems()) exists on Python 2 and 3 alike.
        filters = [value
                   for attr_name, value in dict.items()
                   if attr_name.startswith("test_") and callable(value)]
        dict["FILTERS"] = filters
        return type.__new__(cls, name, bases, dict)


# A box needs at least this many lines to earn the multi-line bonus.
MIN_LINES = 5


class JunkDetector(object):
    """Decide by voting whether a text box is junk.

    Every ``test_*`` method returns an integer vote for a box: positive
    speaks for real content, negative for junk, 0 is neutral.  The
    metaclass collects these methods in ``FILTERS``.
    """
    __metaclass__ = _test_method_collector

    def __init__(self, fontsizes, wordlist):
        """Remember the dominant font size and the word list.

        ``fontsizes`` is handed to ``most_frequent_item``; the result must
        be convertible to float.
        """
        self._default_fontsize = float(most_frequent_item(fontsizes))
        self._wordlist = wordlist

        # Notes left in the original code, apparently ideas for more tests:
        # contains at least n words
        # longer than x
        # normal width
        # more than one line
        # single line, starts with number
        # boxes that are inside other boxes

    def test_fontsize(self, box):
        """Vote on the font size relative to the default size.

        Returns 1 when the box is at most 0.2 smaller than the default (or
        larger), -2 when it is more than 2 smaller, and -1 in between.
        """
        delta = float(box.fontsize) - self._default_fontsize
        if delta >= -.2:
            return 1
        if delta < -2:
            return -2
        return -1

    def test_multiline(self, box):
        """Return 1 for boxes with at least MIN_LINES lines, else 0."""
        if len(box.text.split("\n", MIN_LINES)) < MIN_LINES:
            return 0
        return 1

    def test_very_short_boxes(self, box):
        """Neutral vote; kept as a placeholder.

        The original compared the text length against 10 but returned 0 in
        both branches, so the check never influenced the result.
        """
        # NOTE(review): decide whether short boxes should be penalised.
        return 0

    def is_junk_box(self, box):
        """Return True when the votes of all filters sum to less than zero."""
        # TODO (from the original code): check for certain watermarks
        weight = sum(f(self, box) for f in JunkDetector.FILTERS)
        return weight < 0

    
def _make_column_comparator(middle):
    """Build a cmp-style comparator for boxes on a two-column page.

    A box belongs to the left column when its left edge lies before
    ``middle``.  Left-column boxes sort before all others; inside one
    column the boxes are ordered by ascending ``upleft[1]``.
    """
    def compare(box_a, box_b):
        a_is_left = box_a.upleft[0] < middle
        b_is_left = box_b.upleft[0] < middle

        if a_is_left != b_is_left:
            # different columns: whichever box is on the left comes first
            if a_is_left:
                return -1
            return 1

        # same column: order by the vertical position
        return cmp(box_a.upleft[1], box_b.upleft[1])

    return compare


def is_author(text):
    """Return True when the last line of the stripped text contains an "@"."""
    last_line = text.strip().lower().rpartition("\n")[2]
    return "@" in last_line

def is_author_box(box, *args):
    """Tell whether ``box`` is an author box: on page 1 and accepted by is_author.

    Extra positional arguments are accepted and ignored.
    """
    if box.page.idx != 1:
        return False
    return is_author(box.text)

    
class PdfBoxOutputParser(IterParseHandler):
    """Collect text boxes from an XML stream and build an Article from them.

    The element handlers react to ``page``, ``text-box`` and ``text-group``
    elements and fill ``self.pages``.  Afterwards to_article() filters,
    merges and orders the boxes and distributes them over the article.
    """
    # Start of a table or figure caption, e.g. "Table 3:" or "Figure 1."
    caption_filter = re.compile(r"\s*(Table|Figure) \d+[:.]", re.UNICODE)

    # parsing methods

    def __init__(self, wordlist = None):
        """Set up an empty parser.

        ``wordlist`` is passed on to the text normalisation and to the
        junk detector.
        """
        self._wordlist = wordlist
        self.pages = []
        self._current_page = None
        self._current_text = None
        self._collect = False
        self._pagecount = 0
        # font size (as given in the input) -> number of characters seen
        self._fontsizes = {}
        # rounded box width -> number of characters seen
        self._widths = {}
        self._good_blocks = 0

    @element_handler("page", event = "start")
    def start_page(self, elem):
        """Open a new page; pages are numbered from 1."""
        self._pagecount += 1
        self._current_page = Page(self._pagecount)

    @element_handler("page", event = "end")
    def end_page(self, elem):
        """Close the current page.

        Only pages holding more than one box are kept.  From the fifth page
        on, a dominant font size that starts with "-" aborts the parse.

        Raises:
            BadExtractionError: the dominant font size is negative.
        """
        if len(self._current_page.boxes) > 1:
            self.pages.append(self._current_page)
            self._current_page.compute_extents()

        if self._pagecount > 4 and self._get_default_fontsize().startswith("-"):
            raise BadExtractionError

    @element_handler("text-box", event = "start")
    def start_text_box(self, elem):
        """Start a box from the left/top/right/bottom attributes of ``elem``."""
        left, right = float(elem.get("left")), float(elem.get("right"))
        self._current_text = TextBox(
            (left, float(elem.get("top"))),
            (right, float(elem.get("bottom"))),
            self._current_page)
        self._widths.setdefault(self._current_text.get_rounded_width(), 0)

    @element_handler("text-group", event = "start")
    def start_text_group(self, elem):
        """Record type and font size of the group on the current box."""
        self._collect = True

        # assumes every text-group carries a "fontsize" attribute; a missing
        # one would store None here and break end_text_group -- TODO confirm
        self._current_text.type = elem.get("text-type")
        self._current_text.fontsize = elem.get("fontsize")
        self._fontsizes.setdefault(elem.get("fontsize"), 0)

    @element_handler("text-group", event = "end")
    def end_text_group(self, elem):
        """Store the group's text and prepare a box for the next group.

        The character count of the text is added to the font-size and width
        statistics.  The box is attached to the page unless its font size
        starts with "-" or it is empty.
        """
        if self._collect:
            data = elem.text.strip() if elem.text else ""
            self._fontsizes[self._current_text.fontsize] += len(data)
            self._widths[self._current_text.get_rounded_width()] += len(data)
            self._current_text.text += data

        self._collect = False
        if not self._current_text.fontsize.startswith("-") and len(self._current_text.text) > 0:
            self._current_page.boxes.append(self._current_text)

        # Further groups of the same text-box get a box of their own with
        # the same extents and no text yet.
        self._current_text = self._current_text.copy()
        self._current_text.text = ""

    # conversion/extraction
    def _get_default_fontsize(self):
        """Return the font size that accounts for the most characters."""
        # the font size with the most characters will be assumed to be the
        # default font size
        return most_frequent_item(self._fontsizes)

    def to_article(self, bibtex_file = None):
        """Build, store in ``self.art`` and return the Article.

        The first box becomes the title.  Every other box that consists of
        a single line or is typed "title_candidate" is shown to
        ``Article.get_target``, which answers with a
        ``(swallow, target, single_par)`` triple: swallowed boxes are
        dropped, the others go to ``target`` as ``(text, page index)``.
        With ``single_par`` set, the target falls back to the main text
        after one box.

        Raises:
            BadExtractionError: no main text was collected.
        """
        self.art = Article(bibtex_file)

        target = self.art.main_text
        single_par = False

        # filled as a side effect while the boxes are being iterated
        special = {}
        for idx, box in enumerate(self._get_boxes(special)):
            if idx == 0:
                self.art.title = box.get_normalized_text(self._wordlist)
                continue

            t = box.text.lower().strip()
            if "\n" not in t or box.type == "title_candidate":
                swallow, target, single_par = self.art.get_target(t)
                if swallow:
                    continue

            target.append((box.get_normalized_text(self._wordlist), box.page.idx))
            if single_par:
                target = self.art.main_text

        # the first line of every author box is taken as an author name
        self.art.authors = [convert_chars(t.text.split("\n")[0]) for t in special.get("author", [])]

        if len(self.art.main_text) == 0:
            raise BadExtractionError

        return self.art

    # (category, predicate) pairs; the first matching predicate decides
    # under which category a box is put aside.
    SPECIAL_FILTERS = [
        ("author", is_author_box),
        ("caption", lambda box: PdfBoxOutputParser.caption_filter.match(box.text) is not None)
        ]

    #features
    #- previous box has larger font
    #- starts at left border
    #- next box is in next column or on next page

    def _filter_special_boxes(self, boxes, special_boxes):
        """Yield the ordinary boxes; file the special ones in ``special_boxes``.

        ``special_boxes`` maps a category name to the list of boxes that
        matched the corresponding entry of SPECIAL_FILTERS.
        """
        for box in boxes:
            for box_type, box_filter in self.SPECIAL_FILTERS:
                if box_filter(box):
                    special_boxes.setdefault(box_type, []).append(box)
                    break
            else:
                # no special filter matched
                yield box

    EARLY_JUNK_FILTERS = [
        # page number detector
        lambda b: b.upleft[1] > b.page.height * 0.9 and b.text.strip().isdigit(),
        ]

    def _get_all_boxes(self, special_boxes):
        """Yield the non-junk, non-special boxes of all pages in reading order.

        Special boxes (authors, captions) are collected in
        ``special_boxes`` on the way.
        """
        for page in self.pages:
            # workflow
            # run early junk filters on unmerged boxes
            # NOTE(review): the exact semantics of filter_chain and join are
            # defined in pytake.utils and not visible here.
            boxes = filter_chain(self.EARLY_JUNK_FILTERS, page.boxes)

            # first merge run, see if we can already merge some boxes
            boxes = join(boxes, TextBox.adjacent, TextBox.merge)

            # sort the boxes, assume 2-column layout. This screws up
            # the title on the first page
            # A key function yields the same order as the former comparator
            # (left column first, then ascending vertical position) without
            # relying on the Python-2-only cmp argument of sorted().
            middle = page.width / 2
            boxes = sorted(
                boxes,
                key=lambda b: (not (b.upleft[0] < middle), b.upleft[1]))

            # 2nd merge run, now all adjacent boxes should be merged
            boxes = join(boxes, TextBox.adjacent, TextBox.merge)

            # filter out special boxes that should not appear in the main text,
            # like captions, authors, (footnotes!)
            boxes = self._filter_special_boxes(boxes, special_boxes)

            junk = JunkDetector(self._fontsizes, self._wordlist)
            for box in boxes:
                if not junk.is_junk_box(box):
                    yield box

    def _get_boxes(self, special_boxes):
        """Return all boxes with paragraphs rejoined over column/page breaks."""
        # join paragraphs over column and page breaks
        boxes = join(
            self._get_all_boxes(special_boxes),
            TextBox.same_paragraph,
            lambda a, b: TextBox.merge(a, b, "\n"))
        return boxes