.. Copyright (C) 2001-2012 NLTK Project
.. For license information, see LICENSE.TXT

==========
 Chunking
==========

    >>> from nltk.chunk import *
    >>> from nltk.chunk.util import *
    >>> from nltk.chunk.regexp import *
    >>> from nltk import Tree
    >>> from nltk.test.doctest_utils import *

    >>> tagged_text = "[ The/DT cat/NN ] sat/VBD on/IN [ the/DT mat/NN ] [ the/DT dog/NN ] chewed/VBD ./."
    >>> gold_chunked_text = tagstr2tree(tagged_text)
    >>> unchunked_text = gold_chunked_text.flatten()

Chunking uses a special regexp syntax for rules that delimit the chunks. These
rules must be converted to 'regular' regular expressions before a sentence can
be chunked.

    >>> tag_pattern = "<DT>?<JJ>*<NN.*>"
    >>> regexp_pattern = tag_pattern2re_pattern(tag_pattern)
    >>> regexp_pattern
    '(<(DT)>)?(<(JJ)>)*(<(NN[^\\{\\}<>]*)>)'

Construct some new chunking rules.

    >>> chunk_rule = ChunkRule("<.*>+", "Chunk everything")
    >>> chink_rule = ChinkRule("<VBD|IN|\.>", "Chink on verbs/prepositions")
    >>> split_rule = SplitRule("<DT><NN>", "<DT><NN>",
    ...                        "Split successive determiner/noun pairs")


Create and score a series of chunk parsers, successively more complex.

    >>> chunk_parser = RegexpChunkParser([chunk_rule], chunk_node='NP')
    >>> chunked_text = chunk_parser.parse(unchunked_text)
    >>> print chunked_text
    (S
      (NP
        The/DT
        cat/NN
        sat/VBD
        on/IN
        the/DT
        mat/NN
        the/DT
        dog/NN
        chewed/VBD
        ./.))

    >>> chunkscore = ChunkScore()
    >>> chunkscore.score(gold_chunked_text, chunked_text)
    >>> chunkscore.precision()
    0.0

    >>> chunkscore.recall()
    0.0

    >>> chunkscore.f_measure()
    0

    >>> for chunk in chunkscore.missed(): print chunk
    (NP The/DT cat/NN)
    (NP the/DT mat/NN)
    (NP the/DT dog/NN)

    >>> for chunk in chunkscore.incorrect(): print chunk
    (NP
      The/DT
      cat/NN
      sat/VBD
      on/IN
      the/DT
      mat/NN
      the/DT
      dog/NN
      chewed/VBD
      ./.)

    >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule],
    ...                                  chunk_node='NP')
    >>> chunked_text = chunk_parser.parse(unchunked_text)
    >>> print chunked_text
    (S
      (NP The/DT cat/NN)
      sat/VBD
      on/IN
      (NP the/DT mat/NN the/DT dog/NN)
      chewed/VBD
      ./.)
    >>> assert chunked_text == chunk_parser.parse(list(unchunked_text))

    >>> chunkscore = ChunkScore()
    >>> chunkscore.score(gold_chunked_text, chunked_text)
    >>> chunkscore.precision()
    0.5

    >>> float_equal(chunkscore.recall(), 1.0/3)
    True

    >>> float_equal(chunkscore.f_measure(), 0.4)
    True

    >>> for chunk in chunkscore.missed(): print chunk
    (NP the/DT mat/NN)
    (NP the/DT dog/NN)

    >>> for chunk in chunkscore.incorrect(): print chunk
    (NP the/DT mat/NN the/DT dog/NN)

    >>> chunk_parser = RegexpChunkParser([chunk_rule, chink_rule, split_rule],
    ...                                  chunk_node='NP')
    >>> chunked_text = chunk_parser.parse(unchunked_text, trace=True)
    # Input:
     <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
    # Chunk everything:
    {<DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>}
    # Chink on verbs/prepositions:
    {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>  <DT>  <NN>} <VBD>  <.>
    # Split successive determiner/noun pairs:
    {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
    >>> print chunked_text
    (S
      (NP The/DT cat/NN)
      sat/VBD
      on/IN
      (NP the/DT mat/NN)
      (NP the/DT dog/NN)
      chewed/VBD
      ./.)

    >>> chunkscore = ChunkScore()
    >>> chunkscore.score(gold_chunked_text, chunked_text)
    >>> chunkscore.precision()
    1.0

    >>> chunkscore.recall()
    1.0

    >>> chunkscore.f_measure()
    1.0

    >>> chunkscore.missed()
    []

    >>> chunkscore.incorrect()
    []

    >>> chunk_parser.rules() # doctest: +NORMALIZE_WHITESPACE
    [<ChunkRule: '<.*>+'>, <ChinkRule: '<VBD|IN|\\.>'>,
     <SplitRule: '<DT><NN>', '<DT><NN>'>]

Printing parsers:

    >>> print repr(chunk_parser)
    <RegexpChunkParser with 3 rules>
    >>> print chunk_parser
    RegexpChunkParser with 3 rules:
        Chunk everything
          <ChunkRule: '<.*>+'>
        Chink on verbs/prepositions
          <ChinkRule: '<VBD|IN|\\.>'>
        Split successive determiner/noun pairs
          <SplitRule: '<DT><NN>', '<DT><NN>'>

Regression Tests
~~~~~~~~~~~~~~~~
ChunkParserI
------------
`ChunkParserI` is an abstract interface -- it is not meant to be
instantiated directly.

    >>> ChunkParserI().parse([])
    Traceback (most recent call last):
      . . .
    NotImplementedError


ChunkString
-----------
A ChunkString can be built from a tree of tagged tuples, a tree of
trees, or a tree containing a mixture of both:

    >>> t1 = Tree('S', [('w%d' % i, 't%d' % i) for i in range(10)])
    >>> t2 = Tree('S', [Tree('t0', []), Tree('t1', ['c1'])])
    >>> t3 = Tree('S', [('w0', 't0'), Tree('t1', ['c1'])])
    >>> ChunkString(t1)
    <ChunkString: '<t0><t1><t2><t3><t4><t5><t6><t7><t8><t9>'>
    >>> ChunkString(t2)
    <ChunkString: '<t0><t1>'>
    >>> ChunkString(t3)
    <ChunkString: '<t0><t1>'>

Other values generate an error:

    >>> ChunkString(Tree('S', ['x']))
    Traceback (most recent call last):
      . . .
    ValueError: chunk structures must contain tagged tokens or trees

The `str()` of a chunk string pads it with spaces, so that it lines
up with the `str()` output of other chunk strings over the same
underlying input.

    >>> cs = ChunkString(t1)
    >>> print cs
     <t0>  <t1>  <t2>  <t3>  <t4>  <t5>  <t6>  <t7>  <t8>  <t9>
    >>> cs.xform('<t3>', '{<t3>}')
    >>> print cs
     <t0>  <t1>  <t2> {<t3>} <t4>  <t5>  <t6>  <t7>  <t8>  <t9>

The `_verify()` method makes sure that our transforms don't corrupt
the chunk string.  By setting debug_level to 2 or higher, `_verify()`
will be called at the end of every call to `xform`.  The examples
below use debug_level=3.

    >>> cs = ChunkString(t1, debug_level=3)

    >>> # tag not marked with <...>:
    >>> cs.xform('<t3>', 't3')
    Traceback (most recent call last):
      . . .
    ValueError: Transformation generated invalid chunkstring:
      <t0><t1><t2>t3<t4><t5><t6><t7><t8><t9>

    >>> # brackets not balanced:
    >>> cs.xform('<t3>', '{<t3>')
    Traceback (most recent call last):
      . . .
    ValueError: Transformation generated invalid chunkstring:
      <t0><t1><t2>{<t3><t4><t5><t6><t7><t8><t9>

    >>> # nested brackets:
    >>> cs.xform('<t3><t4><t5>', '{<t3>{<t4>}<t5>}')
    Traceback (most recent call last):
      . . .
    ValueError: Transformation generated invalid chunkstring:
      <t0><t1><t2>{<t3>{<t4>}<t5>}<t6><t7><t8><t9>

    >>> # modified tags:
    >>> cs.xform('<t3>', '<t9>')
    Traceback (most recent call last):
      . . .
    ValueError: Transformation generated invalid chunkstring: tag changed

    >>> # added tags:
    >>> cs.xform('<t9>', '<t9><t10>')
    Traceback (most recent call last):
      . . .
    ValueError: Transformation generated invalid chunkstring: tag changed

Chunking Rules
--------------

Test the different rule constructors and `__repr__` methods:

    >>> r1 = RegexpChunkRule('<a|b>'+ChunkString.IN_CHINK_PATTERN,
    ...                      '{<a|b>}', 'chunk <a> and <b>')
    >>> r2 = RegexpChunkRule(re.compile('<a|b>'+ChunkString.IN_CHINK_PATTERN),
    ...                      '{<a|b>}', 'chunk <a> and <b>')
    >>> r3 = ChunkRule('<a|b>', 'chunk <a> and <b>')
    >>> r4 = ChinkRule('<a|b>', 'chink <a> and <b>')
    >>> r5 = UnChunkRule('<a|b>', 'unchunk <a> and <b>')
    >>> r6 = MergeRule('<a>', '<b>', 'merge <a> w/ <b>')
    >>> r7 = SplitRule('<a>', '<b>', 'split <a> from <b>')
    >>> r8 = ExpandLeftRule('<a>', '<b>', 'expand left <a> <b>')
    >>> r9 = ExpandRightRule('<a>', '<b>', 'expand right <a> <b>')
    >>> for rule in r1, r2, r3, r4, r5, r6, r7, r8, r9:
    ...     print rule
    <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
    <RegexpChunkRule: '<a|b>(?=[^\\}]*(\\{|$))'->'{<a|b>}'>
    <ChunkRule: '<a|b>'>
    <ChinkRule: '<a|b>'>
    <UnChunkRule: '<a|b>'>
    <MergeRule: '<a>', '<b>'>
    <SplitRule: '<a>', '<b>'>
    <ExpandLeftRule: '<a>', '<b>'>
    <ExpandRightRule: '<a>', '<b>'>

`tag_pattern2re_pattern()` complains if the tag pattern looks problematic:

    >>> tag_pattern2re_pattern('{}')
    Traceback (most recent call last):
      . . .
    ValueError: Bad tag pattern: '{}'

RegexpChunkParser
-----------------

A warning is printed when parsing an empty sentence:

    >>> parser = RegexpChunkParser([ChunkRule('<a>', '')])
    >>> parser.parse(Tree('S', []))
    Warning: parsing empty text
    Tree('S', [])

RegexpParser
------------

    >>> parser = RegexpParser('''
    ... NP: {<DT>? <JJ>* <NN>*} # NP
    ... P: {<IN>}           # Preposition
    ... V: {<V.*>}          # Verb
    ... PP: {<P> <NP>}      # PP -> P NP
    ... VP: {<V> <NP|PP>*}  # VP -> V (NP|PP)*
    ... ''')
    >>> print repr(parser)
    <chunk.RegexpParser with 5 stages>
    >>> print parser
    chunk.RegexpParser with 5 stages:
    RegexpChunkParser with 1 rules:
        NP   <ChunkRule: '<DT>? <JJ>* <NN>*'>
    RegexpChunkParser with 1 rules:
        Preposition   <ChunkRule: '<IN>'>
    RegexpChunkParser with 1 rules:
        Verb   <ChunkRule: '<V.*>'>
    RegexpChunkParser with 1 rules:
        PP -> P NP   <ChunkRule: '<P> <NP>'>
    RegexpChunkParser with 1 rules:
        VP -> V (NP|PP)*   <ChunkRule: '<V> <NP|PP>*'>
    >>> print parser.parse(unchunked_text, trace=True)
    # Input:
     <DT>  <NN>  <VBD>  <IN>  <DT>  <NN>  <DT>  <NN>  <VBD>  <.>
    # NP:
    {<DT>  <NN>} <VBD>  <IN> {<DT>  <NN>}{<DT>  <NN>} <VBD>  <.>
    # Input:
     <NP>  <VBD>  <IN>  <NP>  <NP>  <VBD>  <.>
    # Preposition:
     <NP>  <VBD> {<IN>} <NP>  <NP>  <VBD>  <.>
    # Input:
     <NP>  <VBD>  <P>  <NP>  <NP>  <VBD>  <.>
    # Verb:
     <NP> {<VBD>} <P>  <NP>  <NP> {<VBD>} <.>
    # Input:
     <NP>  <V>  <P>  <NP>  <NP>  <V>  <.>
    # PP -> P NP:
     <NP>  <V> {<P>  <NP>} <NP>  <V>  <.>
    # Input:
     <NP>  <V>  <PP>  <NP>  <V>  <.>
    # VP -> V (NP|PP)*:
     <NP> {<V>  <PP>  <NP>}{<V>} <.>
    (S
      (NP The/DT cat/NN)
      (VP
        (V sat/VBD)
        (PP (P on/IN) (NP the/DT mat/NN))
        (NP the/DT dog/NN))
      (VP (V chewed/VBD))
      ./.)

Test parsing of other rule types:

    >>> print RegexpParser('''
    ... X:
    ...   }<a><b>{     # chink rule
    ...   <a>}{<b>     # split rule
    ...   <a>{}<b>     # merge rule
    ...   <a>{<b>}<c>  # chunk rule w/ context
    ... ''')
    chunk.RegexpParser with 1 stages:
    RegexpChunkParser with 4 rules:
        chink rule              <ChinkRule: '<a><b>'>
        split rule              <SplitRule: '<a>', '<b>'>
        merge rule              <MergeRule: '<a>', '<b>'>
        chunk rule w/ context   <ChunkRuleWithContext: '<a>', '<b>', '<c>'>

Illegal patterns give an error message:

    >>> print RegexpParser('X: {<foo>} {<bar>}')
    Traceback (most recent call last):
      . . .
    ValueError: Illegal chunk pattern: {<foo>} {<bar>}