# -*- coding: utf-8 -*-
# (C) 2008 DFKI Language Technology Lab http://www.dfki.de/lt
# Project HyLaP; Author: Torsten Marek
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License 2.1 as published by the Free Software Foundation.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
# USA
"""Turn page/text-box/text-group XML events into an ``Article``.

The parser collects ``TextBox`` objects per ``Page``, merges and sorts them,
drops junk and special boxes, and finally fills an ``Article`` instance.
"""

from __future__ import division

import os
import sgmllib
import re

from pytake.compat import *
from pytake.utils import filter_chain, join, most_frequent_item
from pytake.text_utils import *
from pytake.dom import Article
from pytake.etree_utils import IterParseHandler, element_handler

INPUT_ENCODING = "UTF-8"
OUTPUT_ENCODING = "UTF-8"

# Single-character replacements applied by convert_chars().
# Every key MUST be exactly one character, because convert_chars() looks the
# text up character by character.  The keys are spelled as escapes so that
# editors or encoding round-trips cannot silently normalise them (a ligature
# that gets expanded to "fi", or a no-break space that becomes a plain space,
# would turn the entry into dead weight).
CHAR_CONVERSIONS = {
    u"\ufb01": "fi",     # LATIN SMALL LIGATURE FI
    u"\ufb00": "ff",     # LATIN SMALL LIGATURE FF
    u"\ufb03": "ffi",    # LATIN SMALL LIGATURE FFI
    u"\ufb02": "fl",     # LATIN SMALL LIGATURE FL
    # all dashes are mapped to HYPHEN-MINUS
    u"\u2013": "-",      # EN DASH
    u"\u2014": "-",      # EM DASH
    u"\u2212": "-",      # MINUS SIGN
    u"\u2019": "'",      # RIGHT SINGLE QUOTATION MARK
    u"\u2018": "'",      # LEFT SINGLE QUOTATION MARK
    u"\u201d": '"',      # RIGHT DOUBLE QUOTATION MARK
    u"\u201c": '"',      # LEFT DOUBLE QUOTATION MARK
    u"\u2026": "...",    # HORIZONTAL ELLIPSIS
    u"\xad": "",         # SOFT HYPHEN
    u"\xa0": " ",        # NO BREAK SPACE
}


def convert_chars(s):
    """Return ``s`` with every character listed in CHAR_CONVERSIONS replaced.

    Characters without an entry are copied unchanged.
    """
    return "".join(CHAR_CONVERSIONS.get(c, c) for c in s)


class BadExtractionError(Exception):
    """Raised when the parsed input does not yield a usable article."""
    pass


class Page(object):
    """One page of the input: its 1-based index and the text boxes on it."""

    def __init__(self, idx):
        self.idx = idx
        self.boxes = []

    def compute_extents(self):
        """Set ``width``/``height`` to the largest lower-right box corner.

        Must only be called when ``self.boxes`` is non-empty; ``max`` of an
        empty list raises ``ValueError``.
        """
        self.width = max([b.lowright[0] for b in self.boxes])
        self.height = max([b.lowright[1] for b in self.boxes])


# Boxes on the same line closer than this (in input coordinate units) are
# considered adjacent by TextBox.adjacent().
MAX_HORIZ_MERGE_DISTANCE = 15
# Box widths are rounded down to multiples of this value.
BOX_GRANULARITY = 30


class TextBox(object):
    """A rectangular piece of text on a page.

    ``upleft`` and ``lowright`` are ``(x, y)`` tuples.  ``type`` and
    ``fontsize`` are not set by the constructor; the parser assigns them
    when a text group starts, and ``merge`` copies them from the first box.
    """
    __slots__ = ("text", "upleft", "lowright", "type", "page", "fontsize")

    def __init__(self, upleft, lowright, page):
        self.page = page
        self.text = u""
        self.upleft = upleft
        self.lowright = lowright

    def get_normalized_text(self, wordlist):
        """Return the box text, de-hyphenated and with characters converted.

        The text is split into lines and handed to ``unhyphenize`` together
        with ``wordlist`` (helper from pytake.text_utils, not shown here);
        its result is passed through ``convert_chars``.
        """
        return convert_chars(unhyphenize(self.text.split("\n"), wordlist))

    def copy(self):
        """Return a new box with the same geometry, page and text.

        NOTE(review): ``type`` and ``fontsize`` are NOT carried over, so they
        are unset on the copy until someone assigns them.
        """
        c = TextBox(self.upleft, self.lowright, self.page)
        c.text = self.text
        return c

    def get_rounded_width(self):
        """Return the box width rounded down to a multiple of BOX_GRANULARITY."""
        return int((self.lowright[0] - self.upleft[0]) // BOX_GRANULARITY * BOX_GRANULARITY)

    @staticmethod
    def merge(box_a, box_b, connector=" "):
        """Return a new box spanning ``box_a`` to ``box_b``.

        The texts are joined with ``connector`` after stripping trailing
        whitespace from the first and leading whitespace from the second.
        Page, type and font size are taken from ``box_a``.
        """
        b = TextBox(box_a.upleft, box_b.lowright, box_a.page)
        b.text = connector.join([box_a.text.rstrip(), box_b.text.lstrip()])
        b.type = box_a.type
        b.fontsize = box_a.fontsize
        return b

    @staticmethod
    def adjacent(box_a, box_b):
        """Tell whether ``box_b`` directly continues ``box_a`` on one line.

        True when the horizontal gap is below MAX_HORIZ_MERGE_DISTANCE, both
        boxes share the same top coordinate and the same font size.
        """
        return box_b.upleft[0] - box_a.lowright[0] < MAX_HORIZ_MERGE_DISTANCE \
            and box_a.upleft[1] == box_b.upleft[1] \
            and box_a.fontsize == box_b.fontsize

    @staticmethod
    def same_paragraph(box_a, box_b):
        """Tell whether ``box_b`` continues a paragraph begun in ``box_a``.

        Requires equal font sizes and that ``is_complete_paragraph`` (helper
        from pytake.text_utils) rejects the text of ``box_a``.  Given that,
        the boxes belong together if ``box_b`` is on the following page, or
        on the same page with ``box_a`` left and ``box_b`` right of the
        horizontal page middle.
        """
        if box_a.fontsize != box_b.fontsize:
            return False
        if not is_complete_paragraph(box_a.text):
            # paragraph broken by a page break
            if box_a.page.idx == box_b.page.idx - 1:
                return True
            # paragraph broken by a column break on the same page
            middle = box_a.page.width / 2
            if box_a.page.idx == box_b.page.idx \
                    and box_a.upleft[0] < middle and box_b.upleft[0] > middle:
                return True
        return False


class _test_method_collector(type):
    """Metaclass that gathers all callable ``test_*`` attributes of a class.

    The collected functions are stored, in no particular order, in the class
    attribute ``FILTERS``.
    """

    def __new__(cls, name, bases, dict):
        # The loop variables must not be called ``name``: in Python 2 a list
        # comprehension leaks its loop variable into the enclosing scope,
        # which would overwrite the class name passed to type.__new__ below.
        filters = [value for attr_name, value in dict.items()
                   if attr_name.startswith("test_") and callable(value)]
        dict["FILTERS"] = filters
        return type.__new__(cls, name, bases, dict)


# Minimum number of lines a box needs to score in JunkDetector.test_multiline.
MIN_LINES = 5


class JunkDetector(object):
    """Scores text boxes with a set of ``test_*`` heuristics.

    Every ``test_*`` method returns an integer weight; a box whose weights
    sum to a negative value is considered junk.
    """
    __metaclass__ = _test_method_collector

    def __init__(self, fontsizes, wordlist):
        # fontsizes is handed to most_frequent_item(); the parser passes its
        # mapping of font size string -> number of characters seen.
        self._default_fontsize = float(most_frequent_item(fontsizes))
        self._wordlist = wordlist

    # ideas for further tests:
    # contains at least n words
    # longer than x
    # normal width
    # more than one line
    # single line, starts with number
    # boxes that are inside other boxes

    def test_fontsize(self, box):
        """Score the font size relative to the default font size.

        Returns 1 if at most 0.2 smaller than the default (or larger),
        -2 if more than 2 smaller, and -1 otherwise.
        """
        delta = float(box.fontsize) - self._default_fontsize
        if delta >= -.2:
            return 1
        if delta < -2:
            return -2
        return -1

    def test_multiline(self, box):
        """Return 1 if the text has at least MIN_LINES lines, else 0."""
        if len(box.text.split("\n", MIN_LINES)) < MIN_LINES:
            return 0
        else:
            return 1

    def test_very_short_boxes(self, box):
        """Currently neutral: always returns 0.

        NOTE(review): the original distinguished ``len(box.text) < 10`` but
        returned 0 in both branches; presumably short boxes were meant to be
        penalised - confirm before changing the weight.
        """
        if len(box.text) < 10:
            return 0
        else:
            return 0

    def is_junk_box(self, box):
        """Return True if the summed weights of all filters are negative."""
        weight = 0
        for f in JunkDetector.FILTERS:
            weight += f(self, box)
        # check for certain watermarks
        return weight < 0


def _make_column_comparator(middle):
    """Return a ``cmp``-style comparator for a two-column reading order.

    Boxes whose left edge is left of ``middle`` sort before all others;
    within one column boxes are ordered by their top coordinate.
    """
    def compare(box_a, box_b):
        side_a = box_a.upleft[0] < middle
        side_b = box_b.upleft[0] < middle
        if side_a == side_b:
            return cmp(box_a.upleft[1], box_b.upleft[1])
        else:
            return cmp(not side_a, not side_b)
    return compare


def is_author(text):
    """Return True if the last line of the stripped text contains ``@``."""
    t = text.strip().lower().split("\n")
    return "@" in t[-1]


def is_author_box(box, *args):
    """Return True for a box on page 1 whose text passes ``is_author``."""
    return box.page.idx == 1 and is_author(box.text)


class PdfBoxOutputParser(IterParseHandler):
    """Collects text boxes from parse events and converts them to an Article.

    The ``element_handler`` decorated methods react to the start/end of
    ``page``, ``text-box`` and ``text-group`` elements; ``to_article`` turns
    the collected pages into an ``Article``.
    """
    caption_filter = re.compile(r"\s*(Table|Figure) \d+[:.]", re.UNICODE)

    # parsing methods
    def __init__(self, wordlist = None):
        self._wordlist = wordlist
        self.pages = []
        self._current_page = None
        self._current_text = None
        self._collect = False
        self._pagecount = 0
        # font size string -> number of characters seen in that size
        self._fontsizes = {}
        # rounded box width -> number of characters seen in boxes that wide
        self._widths = {}
        self._good_blocks = 0

    @element_handler("page", event = "start")
    def start_page(self, elem):
        """Open a new Page numbered with the running page count."""
        self._pagecount += 1
        self._current_page = Page(self._pagecount)

    @element_handler("page", event = "end")
    def end_page(self, elem):
        """Keep the finished page if it holds more than one box.

        Raises BadExtractionError once more than four pages were seen and
        the most frequent font size string starts with ``-``.
        """
        if len(self._current_page.boxes) > 1:
            self.pages.append(self._current_page)
            self._current_page.compute_extents()
        # NOTE(review): nesting reconstructed from a whitespace-mangled
        # source; this check is assumed to run for every page - confirm.
        if self._pagecount > 4:
            if self._get_default_fontsize().startswith("-"):
                raise BadExtractionError

    @element_handler("text-box", event = "start")
    def start_text_box(self, elem):
        """Create the current TextBox from the element's coordinates."""
        left, right = float(elem.get("left")), float(elem.get("right"))
        self._current_text = TextBox(
            (left, float(elem.get("top"))),
            (right, float(elem.get("bottom"))),
            self._current_page)
        self._widths.setdefault(self._current_text.get_rounded_width(), 0)

    @element_handler("text-group", event = "start")
    def start_text_group(self, elem):
        """Start collecting text; record type and font size of the group."""
        self._collect = True
        self._current_text.type = elem.get("text-type")
        self._current_text.fontsize = elem.get("fontsize")
        self._fontsizes.setdefault(elem.get("fontsize"), 0)

    @element_handler("text-group", event = "end")
    def end_text_group(self, elem):
        """Store the group's text and update the character statistics.

        The box is appended to the current page unless its font size string
        starts with ``-`` or its text is empty.  Afterwards the current box
        is replaced by an empty copy with the same geometry.
        """
        if self._collect:
            data = elem.text.strip() if elem.text else ""
            self._fontsizes[self._current_text.fontsize] += len(data)
            self._widths[self._current_text.get_rounded_width()] += len(data)
            self._current_text.text += data
            self._collect = False
            if not self._current_text.fontsize.startswith("-") \
                    and len(self._current_text.text) > 0:
                self._current_page.boxes.append(self._current_text)
            # NOTE(review): nesting reconstructed from a whitespace-mangled
            # source; the reset is assumed to happen whether or not the box
            # was appended - confirm.
            self._current_text = self._current_text.copy()
            self._current_text.text = ""

    # conversion/extraction
    def _get_default_fontsize(self):
        """Return the font size that most_frequent_item picks."""
        # the font size with the most characters will be assumed to be the
        # default font size
        return most_frequent_item(self._fontsizes)

    def to_article(self, bibtex_file = None):
        """Build and return an Article from the collected boxes.

        The first box becomes the title.  For every later box that is a
        single line or a ``title_candidate``, ``Article.get_target`` decides
        whether the box is swallowed and where following text goes.  Authors
        are the first lines of the boxes classified as ``author``.

        Raises BadExtractionError if the main text ends up empty.
        """
        self.art = Article(bibtex_file)
        target = self.art.main_text
        single_par = False
        special = {}
        for idx, box in enumerate(self._get_boxes(special)):
            if idx == 0:
                self.art.title = box.get_normalized_text(self._wordlist)
            else:
                t = box.text.lower().strip()
                if t.find("\n") == -1 or box.type == "title_candidate":
                    swallow, target, single_par = self.art.get_target(t)
                    if swallow:
                        continue
                target.append((box.get_normalized_text(self._wordlist), box.page.idx))
                if single_par:
                    # only one paragraph goes to the special target
                    target = self.art.main_text

        self.art.authors = [convert_chars(t.text.split("\n")[0])
                            for t in special.get("author", [])]
        if len(self.art.main_text) == 0:
            raise BadExtractionError
        return self.art

    # (category, predicate) pairs; the first matching predicate wins
    SPECIAL_FILTERS = [
        ("author", is_author_box),
        ("caption", lambda box: PdfBoxOutputParser.caption_filter.match(box.text) is not None)
    ]
    #features
    #- previous box has larger font
    #- starts at left border
    #- next box is in next column or on next page

    def _filter_special_boxes(self, boxes, special_boxes):
        """Yield the boxes that match none of the SPECIAL_FILTERS.

        Matching boxes are appended to ``special_boxes[category]`` instead.
        """
        for box in boxes:
            for box_type, box_filter in self.SPECIAL_FILTERS:
                if box_filter(box):
                    special_boxes.setdefault(box_type, []).append(box)
                    break
            else:
                yield box

    EARLY_JUNK_FILTERS = [
        # page number detector
        lambda b: b.upleft[1] > b.page.height * 0.9 and b.text.strip().isdigit(),
    ]

    def _get_all_boxes(self, special_boxes):
        """Yield, page by page, the merged and sorted non-junk boxes.

        ``filter_chain`` and ``join`` come from pytake.utils and are not
        shown here; presumably ``join`` merges neighbouring items for which
        the predicate holds - confirm there.
        """
        for page in self.pages:
            # workflow
            # run early junk filters on unmerged boxes
            boxes = filter_chain(self.EARLY_JUNK_FILTERS, page.boxes)
            # first merge run, see if we can already merge some boxes
            boxes = join(boxes, TextBox.adjacent, TextBox.merge)
            # sort the boxes, assume 2-column layout. This screws up
            # the title on the first page
            boxes = sorted(boxes, _make_column_comparator(page.width / 2))
            # 2nd merge run, now all adjacent boxes should be merged
            boxes = join(boxes, TextBox.adjacent, TextBox.merge)
            # filter out special boxes that should not appear in the main text,
            # like captions, authors, (footnotes!)
            boxes = self._filter_special_boxes(boxes, special_boxes)
            junk = JunkDetector(self._fontsizes, self._wordlist)
            for box in boxes:
                if not junk.is_junk_box(box):
                    yield box

    def _get_boxes(self, special_boxes):
        """Return all boxes with paragraphs re-joined across breaks."""
        # join paragraphs over column and page breaks
        boxes = join(
            self._get_all_boxes(special_boxes),
            TextBox.same_paragraph,
            lambda a, b: TextBox.merge(a, b, "\n"))
        return boxes