/*
   converter for HTML-entities

   HTML-entities from the input stream are converted to characters
   in UTF-8 or ISO-8859-1 (see options)

   This program is free software; you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation; either version 2, or (at your option)
   any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program; if not, write to the Free Software
   Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston,
   MA 02110-1301, USA.

   Written 2004 by Sebastian Nagel, CIS Uni München
*/

%option caseful 8bit batch noyywrap

%{
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

typedef unsigned int BOOL;
enum { FALSE = 0, TRUE = 1 };
enum { ISO_LATIN_1 = 1, UTF8 };

/* Largest code point of each supported output charset, plus the
   surrogate range, which has no UTF-8 representation. */
enum {
  LATIN_1_MAX     = 0xff,
  SURROGATE_FIRST = 0xd800,
  SURROGATE_LAST  = 0xdfff,
  UNICODE_MAX     = 0x10ffff
};

/* Run-time settings, filled in by main() from the command line. */
struct option {
  BOOL          QUIET;     /* -q: suppress diagnostics on stderr */
  unsigned char ENCODING;  /* -C: ISO_LATIN_1 or UTF8 */
  BOOL          FORCE;     /* -f: drop what cannot be converted */
  BOOL          Replace;   /* -R: substitute the character rChar */
  unsigned int  rChar;     /*     code point used by -R */
  BOOL          replace;   /* -r: substitute the string rString */
  char*         rString;   /*     text used by -r */
} option = {
  FALSE,   /* (QUIET) default: report errors */
  UTF8,    /* default encoding */
  FALSE,   /* don't skip unrecognized/unprintable characters */
  FALSE,   /* don't replace unrecognized/unprintable characters */
  0x3f,    /* question mark */
  FALSE,   /* don't replace unrecognized/unprintable characters */
  "???"
};

unsigned int num; /* buffer for value of numerical entities */

/* Nonzero if `cp` can be written as a character: not NUL (it would
   terminate the C string handed to fputs), not a surrogate, and not
   beyond the last Unicode code point. */
static int
is_writable_code_point (unsigned int cp)
{
  if (cp == 0 || cp > UNICODE_MAX)
    return 0;
  return cp < SURROGATE_FIRST || cp > SURROGATE_LAST;
}

/* Convert the digits of a numeric character reference.
   `digits` points just behind "&#" / "&#x"; conversion stops at the
   closing ';'.  Anything above UNICODE_MAX -- including values too
   large for strtoul(), which then returns ULONG_MAX -- is clamped to
   UNICODE_MAX + 1 so that the encoders report it as illegal instead
   of the value silently wrapping around. */
static unsigned int
parse_code_point (const char *digits, int base)
{
  unsigned long value = strtoul(digits, NULL, base);

  if (value > UNICODE_MAX)
    return UNICODE_MAX + 1u;
  return (unsigned int) value;
}

/* Encode `num` as a NUL-terminated UTF-8 sequence in `out` (which
   must hold at least 5 bytes) and return `out`.
   If `num` cannot be written, `out` becomes the empty string and,
   unless -q was given, a diagnostic naming the current token is
   printed on stderr. */
static char*
num_to_utf8 (unsigned int num, char* out)
{
  int i = 0;

  if (! is_writable_code_point(num))
    {
      if (option.QUIET != TRUE)
        {
          if (num > UNICODE_MAX)
            /* characters > 0x10ffff are illegal in Unicode 4.0 */
            fprintf(stderr, "Char 0x%x (\"%s\") not defined in unicode!\n",
                    num, yytext);
          else
            fprintf(stderr,
                    "Char 0x%x (\"%s\") is not a printable Unicode character!\n",
                    num, yytext);
        }
    }
  else if (num <= 0x7f)
    {
      out[i++] = (char)(unsigned char)(num);
    }
  else if (num <= 0x7ff)
    {
      /* 00000000 00000000 00000bbb bbaaaaaa
         ==>  110bbbbb 10aaaaaa */
      out[i++] = (char)(unsigned char)(0xc0 | (num >> 6));
      out[i++] = (char)(unsigned char)(0x80 | (num & 0x3f));
    }
  else if (num <= 0xffff)
    {
      /* 00000000 00000000 ccccbbbb bbaaaaaa
         ==>  1110cccc 10bbbbbb 10aaaaaa */
      out[i++] = (char)(unsigned char)(0xe0 | (num >> 12));
      out[i++] = (char)(unsigned char)(0x80 | ((num >> 6) & 0x3f));
      out[i++] = (char)(unsigned char)(0x80 | (num & 0x3f));
    }
  else
    {
      /* 00000000 000dddcc ccccbbbb bbaaaaaa
         ==>  11110ddd 10cccccc 10bbbbbb 10aaaaaa */
      out[i++] = (char)(unsigned char)(0xf0 | (num >> 18));
      out[i++] = (char)(unsigned char)(0x80 | ((num >> 12) & 0x3f));
      out[i++] = (char)(unsigned char)(0x80 | ((num >> 6) & 0x3f));
      out[i++] = (char)(unsigned char)(0x80 | (num & 0x3f));
    }
  out[i] = '\0';
  return out;
}

/* Encode `num` as a single ISO-8859-1 byte in `out` (at least 2
   bytes) and return `out`.  If `num` is NUL or above 0xff, `out`
   becomes the empty string and, unless -q was given, a diagnostic
   is printed on stderr. */
static char*
num_to_iso_latin_1 (unsigned int num, char* out)
{
  int i = 0;

  if (num != 0 && num <= LATIN_1_MAX)
    out[i++] = (char)(unsigned char)(num);
  else if (option.QUIET != TRUE)
    {
      if (num == 0)
        fprintf(stderr,
                "Char 0x%x (\"%s\") is not a printable character!\n",
                num, yytext);
      else
        fprintf(stderr,
                "Char 0x%x (\"%s\") doesn't exist in ISO-8859-1: use UTF-8!\n",
                num, yytext);
    }
  out[i] = '\0';
  return out;
}

/* Encode `num` into `buf` according to the selected output encoding.
   `buf` is left empty if the character cannot be represented. */
static void
encode (unsigned int num, char *buf)
{
  buf[0] = '\0';
  if (option.ENCODING == UTF8)
    num_to_utf8(num, buf);
  else if (option.ENCODING == ISO_LATIN_1)
    num_to_iso_latin_1(num, buf);
}

/* Write the character with code point `num` to yyout.
   A character that cannot be represented in the output encoding is
   handled as the options request: replaced by the -R character or
   the -r string, dropped with -f, and otherwise the entity is copied
   to the output unchanged so that no information is lost. */
static void
out (unsigned int num)
{
  char buf[8];

  encode(num, buf);
  if (buf[0] != '\0')
    {
      fputs(buf, yyout);
      return;
    }

  if (option.Replace == TRUE)
    {
      /* main() has verified that rChar fits the output encoding */
      encode(option.rChar, buf);
      fputs(buf, yyout);
    }
  else if (option.replace == TRUE)
    fputs(option.rString, yyout);
  else if (option.FORCE != TRUE)
    fputs(yytext, yyout);
}
%}

%%

"&quot;"      out(0x22);   /* quotation mark = APL quote */
"&amp;"       out(0x26);   /* ampersand */
"&lt;"        out(0x3c);   /* less-than sign */
"&gt;"        out(0x3e);   /* greater-than sign */
"&nbsp;"      out(0xa0);   /* no-break space = non-breaking space */
"&iexcl;"     out(0xa1);   /* inverted exclamation mark */
"&cent;"      out(0xa2);   /* cent sign */
"&pound;"     out(0xa3);   /* pound sign */
"&curren;"    out(0xa4);   /* currency sign */
"&yen;"       out(0xa5);   /* yen sign = yuan sign */
"&brvbar;"    out(0xa6);   /* broken bar = broken vertical bar */
"&sect;"      out(0xa7);   /* section sign */
"&uml;"       out(0xa8);   /* diaeresis = spacing diaeresis */
"&copy;"      out(0xa9);   /* copyright sign */
"&ordf;"      out(0xaa);   /* feminine ordinal indicator */
"&laquo;"     out(0xab);   /* left-pointing double angle quotation mark */
"&not;"       out(0xac);   /* not sign */
"&shy;"       out(0xad);   /* soft hyphen = discretionary hyphen */
"&reg;"       out(0xae);   /* registered sign = registered trade mark sign */
"&macr;"      out(0xaf);   /* macron = spacing macron = overline */
"&deg;"       out(0xb0);   /* degree sign */
"&plusmn;"    out(0xb1);   /* plus-minus sign = plus-or-minus sign */
"&sup2;"      out(0xb2);   /* superscript two = superscript digit two */
"&sup3;"      out(0xb3);   /* superscript three = superscript digit three */
"&acute;"     out(0xb4);   /* acute accent = spacing acute */
"&micro;"     out(0xb5);   /* micro sign */
"&para;"      out(0xb6);   /* pilcrow sign = paragraph sign */
"&middot;"    out(0xb7);   /* middle dot = Georgian comma */
"&cedil;"     out(0xb8);   /* cedilla = spacing cedilla */
"&sup1;"      out(0xb9);   /* superscript one = superscript digit one */
"&ordm;"      out(0xba);   /* masculine ordinal indicator */
"&raquo;"     out(0xbb);   /* right-pointing double angle quotation mark */
"&frac14;"    out(0xbc);   /* vulgar fraction one quarter */
"&frac12;"    out(0xbd);   /* vulgar fraction one half */
"&frac34;"    out(0xbe);   /* vulgar fraction three quarters */
"&iquest;"    out(0xbf);   /* inverted question mark */
"&Agrave;"    out(0xc0);   /* latin capital letter A with grave */
"&Aacute;"    out(0xc1);   /* latin capital letter A with acute */
"&Acirc;"     out(0xc2);   /* latin capital letter A with circumflex */
"&Atilde;"    out(0xc3);   /* latin capital letter A with tilde */
"&Auml;"      out(0xc4);   /* latin capital letter A with diaeresis */
"&Aring;"     out(0xc5);   /* latin capital letter A with ring above */
"&AElig;"     out(0xc6);   /* latin capital letter AE */
"&Ccedil;"    out(0xc7);   /* latin capital letter C with cedilla */
"&Egrave;"    out(0xc8);   /* latin capital letter E with grave */
"&Eacute;"    out(0xc9);   /* latin capital letter E with acute */
"&Ecirc;"     out(0xca);   /* latin capital letter E with circumflex */
"&Euml;"      out(0xcb);   /* latin capital letter E with diaeresis */
"&Igrave;"    out(0xcc);   /* latin capital letter I with grave */
"&Iacute;"    out(0xcd);   /* latin capital letter I with acute */
"&Icirc;"     out(0xce);   /* latin capital letter I with circumflex */
"&Iuml;"      out(0xcf);   /* latin capital letter I with diaeresis */
"&ETH;"       out(0xd0);   /* latin capital letter ETH */
"&Ntilde;"    out(0xd1);   /* latin capital letter N with tilde */
"&Ograve;"    out(0xd2);   /* latin capital letter O with grave */
"&Oacute;"    out(0xd3);   /* latin capital letter O with acute */
"&Ocirc;"     out(0xd4);   /* latin capital letter O with circumflex */
"&Otilde;"    out(0xd5);   /* latin capital letter O with tilde */
"&Ouml;"      out(0xd6);   /* latin capital letter O with diaeresis */
"&times;"     out(0xd7);   /* multiplication sign */
"&Oslash;"    out(0xd8);   /* latin capital letter O with stroke */
"&Ugrave;"    out(0xd9);   /* latin capital letter U with grave */
"&Uacute;"    out(0xda);   /* latin capital letter U with acute */
"&Ucirc;"     out(0xdb);   /* latin capital letter U with circumflex */
"&Uuml;"      out(0xdc);   /* latin capital letter U with diaeresis */
"&Yacute;"    out(0xdd);   /* latin capital letter Y with acute */
"&THORN;"     out(0xde);   /* latin capital letter THORN */
"&szlig;"     out(0xdf);   /* latin small letter sharp s = ess-zed */
"&agrave;"    out(0xe0);   /* latin small letter a with grave */
"&aacute;"    out(0xe1);   /* latin small letter a with acute */
"&acirc;"     out(0xe2);   /* latin small letter a with circumflex */
"&atilde;"    out(0xe3);   /* latin small letter a with tilde */
"&auml;"      out(0xe4);   /* latin small letter a with diaeresis */
"&aring;"     out(0xe5);   /* latin small letter a with ring above */
"&aelig;"     out(0xe6);   /* latin small letter ae */
"&ccedil;"    out(0xe7);   /* latin small letter c with cedilla */
"&egrave;"    out(0xe8);   /* latin small letter e with grave */
"&eacute;"    out(0xe9);   /* latin small letter e with acute */
"&ecirc;"     out(0xea);   /* latin small letter e with circumflex */
"&euml;"      out(0xeb);   /* latin small letter e with diaeresis */
"&igrave;"    out(0xec);   /* latin small letter i with grave */
"&iacute;"    out(0xed);   /* latin small letter i with acute */
"&icirc;"     out(0xee);   /* latin small letter i with circumflex */
"&iuml;"      out(0xef);   /* latin small letter i with diaeresis */
"&eth;"       out(0xf0);   /* latin small letter eth */
"&ntilde;"    out(0xf1);   /* latin small letter n with tilde */
"&ograve;"    out(0xf2);   /* latin small letter o with grave */
"&oacute;"    out(0xf3);   /* latin small letter o with acute */
"&ocirc;"     out(0xf4);   /* latin small letter o with circumflex */
"&otilde;"    out(0xf5);   /* latin small letter o with tilde */
"&ouml;"      out(0xf6);   /* latin small letter o with diaeresis */
"&divide;"    out(0xf7);   /* division sign */
"&oslash;"    out(0xf8);   /* latin small letter o with stroke */
"&ugrave;"    out(0xf9);   /* latin small letter u with grave */
"&uacute;"    out(0xfa);   /* latin small letter u with acute */
"&ucirc;"     out(0xfb);   /* latin small letter u with circumflex */
"&uuml;"      out(0xfc);   /* latin small letter u with diaeresis */
"&yacute;"    out(0xfd);   /* latin small letter y with acute */
"&thorn;"     out(0xfe);   /* latin small letter thorn */
"&yuml;"      out(0xff);   /* latin small letter y with diaeresis */
"&fnof;"      out(0x192);  /* latin small f with hook = function */
"&Alpha;"     out(0x391);  /* greek capital letter alpha */
"&Beta;"      out(0x392);  /* greek capital letter beta */
"&Gamma;"     out(0x393);  /* greek capital letter gamma */
"&Delta;"     out(0x394);  /* greek capital letter delta */
"&Epsilon;"   out(0x395);  /* greek capital letter epsilon */
"&Zeta;"      out(0x396);  /* greek capital letter zeta */
"&Eta;"       out(0x397);  /* greek capital letter eta */
"&Theta;"     out(0x398);  /* greek capital letter theta */
"&Iota;"      out(0x399);  /* greek capital letter iota */
"&Kappa;"     out(0x39a);  /* greek capital letter kappa */
"&Lambda;"    out(0x39b);  /* greek capital letter lambda */
"&Mu;"        out(0x39c);  /* greek capital letter mu */
"&Nu;"        out(0x39d);  /* greek capital letter nu */
"&Xi;"        out(0x39e);  /* greek capital letter xi */
"&Omicron;"   out(0x39f);  /* greek capital letter omicron */
"&Pi;"        out(0x3a0);  /* greek capital letter pi */
"&Rho;"       out(0x3a1);  /* greek capital letter rho */
"&Sigma;"     out(0x3a3);  /* greek capital letter sigma */
"&Tau;"       out(0x3a4);  /* greek capital letter tau */
"&Upsilon;"   out(0x3a5);  /* greek capital letter upsilon */
"&Phi;"       out(0x3a6);  /* greek capital letter phi */
"&Chi;"       out(0x3a7);  /* greek capital letter chi */
"&Psi;"       out(0x3a8);  /* greek capital letter psi */
"&Omega;"     out(0x3a9);  /* greek capital letter omega */
"&alpha;"     out(0x3b1);  /* greek small letter alpha */
"&beta;"      out(0x3b2);  /* greek small letter beta */
"&gamma;"     out(0x3b3);  /* greek small letter gamma */
"&delta;"     out(0x3b4);  /* greek small letter delta */
"&epsilon;"   out(0x3b5);  /* greek small letter epsilon */
"&zeta;"      out(0x3b6);  /* greek small letter zeta */
"&eta;"       out(0x3b7);  /* greek small letter eta */
"&theta;"     out(0x3b8);  /* greek small letter theta */
"&iota;"      out(0x3b9);  /* greek small letter iota */
"&kappa;"     out(0x3ba);  /* greek small letter kappa */
"&lambda;"    out(0x3bb);  /* greek small letter lambda */
"&mu;"        out(0x3bc);  /* greek small letter mu */
"&nu;"        out(0x3bd);  /* greek small letter nu */
"&xi;"        out(0x3be);  /* greek small letter xi */
"&omicron;"   out(0x3bf);  /* greek small letter omicron */
"&pi;"        out(0x3c0);  /* greek small letter pi */
"&rho;"       out(0x3c1);  /* greek small letter rho */
"&sigmaf;"    out(0x3c2);  /* greek small letter final sigma */
"&sigma;"     out(0x3c3);  /* greek small letter sigma */
"&tau;"       out(0x3c4);  /* greek small letter tau */
"&upsilon;"   out(0x3c5);  /* greek small letter upsilon */
"&phi;"       out(0x3c6);  /* greek small letter phi */
"&chi;"       out(0x3c7);  /* greek small letter chi */
"&psi;"       out(0x3c8);  /* greek small letter psi */
"&omega;"     out(0x3c9);  /* greek small letter omega */
"&thetasym;"  out(0x3d1);  /* greek small letter theta symbol */
"&upsih;"     out(0x3d2);  /* greek upsilon with hook symbol */
"&piv;"       out(0x3d6);  /* greek pi symbol */
"&bull;"      out(0x2022); /* bullet = black small circle */
"&hellip;"    out(0x2026); /* horizontal ellipsis = three dot leader */
"&prime;"     out(0x2032); /* prime = minutes = feet */
"&Prime;"     out(0x2033); /* double prime = seconds = inches */
"&oline;"     out(0x203e); /* overline = spacing overscore */
"&frasl;"     out(0x2044); /* fraction slash */
"&weierp;"    out(0x2118); /* script capital P = power set */
"&image;"     out(0x2111); /* blackletter capital I = imaginary part */
"&real;"      out(0x211c); /* blackletter capital R = real part symbol */
"&trade;"     out(0x2122); /* trade mark sign */
"&alefsym;"   out(0x2135); /* alef symbol = first transfinite cardinal */
"&larr;"      out(0x2190); /* leftwards arrow */
"&uarr;"      out(0x2191); /* upwards arrow */
"&rarr;"      out(0x2192); /* rightwards arrow */
"&darr;"      out(0x2193); /* downwards arrow */
"&harr;"      out(0x2194); /* left right arrow */
"&crarr;"     out(0x21b5); /* downwards arrow with corner leftwards */
"&lArr;"      out(0x21d0); /* leftwards double arrow */
"&uArr;"      out(0x21d1); /* upwards double arrow */
"&rArr;"      out(0x21d2); /* rightwards double arrow */
"&dArr;"      out(0x21d3); /* downwards double arrow */
"&hArr;"      out(0x21d4); /* left right double arrow */
"&forall;"    out(0x2200); /* for all */
"&part;"      out(0x2202); /* partial differential */
"&exist;"     out(0x2203); /* there exists */
"&empty;"     out(0x2205); /* empty set = null set = diameter */
"&nabla;"     out(0x2207); /* nabla = backward difference */
"&isin;"      out(0x2208); /* element of */
"&notin;"     out(0x2209); /* not an element of */
"&ni;"        out(0x220b); /* contains as member */
"&prod;"      out(0x220f); /* n-ary product = product sign */
"&sum;"       out(0x2211); /* n-ary summation */
"&minus;"     out(0x2212); /* minus sign */
"&lowast;"    out(0x2217); /* asterisk operator */
"&radic;"     out(0x221a); /* square root = radical sign */
"&prop;"      out(0x221d); /* proportional to */
"&infin;"     out(0x221e); /* infinity */
"&ang;"       out(0x2220); /* angle */
"&and;"       out(0x2227); /* logical and = wedge */
"&or;"        out(0x2228); /* logical or = vee */
"&cap;"       out(0x2229); /* intersection = cap */
"&cup;"       out(0x222a); /* union = cup */
"&int;"       out(0x222b); /* integral */
"&there4;"    out(0x2234); /* therefore */
"&sim;"       out(0x223c); /* tilde operator = varies with = similar to */
"&cong;"      out(0x2245); /* approximately equal to */
"&asymp;"     out(0x2248); /* almost equal to = asymptotic to */
"&ne;"        out(0x2260); /* not equal to */
"&equiv;"     out(0x2261); /* identical to */
"&le;"        out(0x2264); /* less-than or equal to */
"&ge;"        out(0x2265); /* greater-than or equal to */
"&sub;"       out(0x2282); /* subset of */
"&sup;"       out(0x2283); /* superset of */
"&nsub;"      out(0x2284); /* not a subset of */
"&sube;"      out(0x2286); /* subset of or equal to */
"&supe;"      out(0x2287); /* superset of or equal to */
"&oplus;"     out(0x2295); /* circled plus = direct sum */
"&otimes;"    out(0x2297); /* circled times = vector product */
"&perp;"      out(0x22a5); /* up tack = orthogonal to = perpendicular */
"&sdot;"      out(0x22c5); /* dot operator */
"&lceil;"     out(0x2308); /* left ceiling = apl upstile */
"&rceil;"     out(0x2309); /* right ceiling */
"&lfloor;"    out(0x230a); /* left floor = apl downstile */
"&rfloor;"    out(0x230b); /* right floor */
"&lang;"      out(0x2329); /* left-pointing angle bracket = bra */
"&rang;"      out(0x232a); /* right-pointing angle bracket = ket */
"&loz;"       out(0x25ca); /* lozenge */
"&spades;"    out(0x2660); /* black spade suit */
"&clubs;"     out(0x2663); /* black club suit = shamrock */
"&hearts;"    out(0x2665); /* black heart suit = valentine */
"&diams;"     out(0x2666); /* black diamond suit */
"&OElig;"     out(0x152);  /* latin capital ligature OE */
"&oelig;"     out(0x153);  /* latin small ligature oe */
"&Scaron;"    out(0x160);  /* latin capital letter S with caron */
"&scaron;"    out(0x161);  /* latin small letter s with caron */
"&Yuml;"      out(0x178);  /* latin capital letter Y with diaeresis */
"&circ;"      out(0x2c6);  /* modifier letter circumflex accent */
"&tilde;"     out(0x2dc);  /* small tilde */
"&ensp;"      out(0x2002); /* en space */
"&emsp;"      out(0x2003); /* em space */
"&thinsp;"    out(0x2009); /* thin space */
"&zwnj;"      out(0x200c); /* zero width non-joiner */
"&zwj;"       out(0x200d); /* zero width joiner */
"&lrm;"       out(0x200e); /* left-to-right mark */
"&rlm;"       out(0x200f); /* right-to-left mark */
"&ndash;"     out(0x2013); /* en dash */
"&mdash;"     out(0x2014); /* em dash */
"&lsquo;"     out(0x2018); /* left single quotation mark */
"&rsquo;"     out(0x2019); /* right single quotation mark */
"&sbquo;"     out(0x201a); /* single low-9 quotation mark */
"&ldquo;"     out(0x201c); /* left double quotation mark */
"&rdquo;"     out(0x201d); /* right double quotation mark */
"&bdquo;"     out(0x201e); /* double low-9 quotation mark */
"&dagger;"    out(0x2020); /* dagger */
"&Dagger;"    out(0x2021); /* double dagger */
"&permil;"    out(0x2030); /* per mille sign */
"&lsaquo;"    out(0x2039); /* single left-pointing angle quotation mark */
"&rsaquo;"    out(0x203a); /* single right-pointing angle quotation mark */
"&euro;"      out(0x20ac); /* euro sign */

"&#x"[0-9A-Fa-f]+";"  {
                        /* hexadecimal reference; digits start behind "&#x" */
                        num = parse_code_point(yytext + 3, 16);
                        out(num);
                      }
"&#X"[0-9A-Fa-f]+";"  {
                        num = parse_code_point(yytext + 3, 16);
                        out(num);
                      }
"&#"[0-9]+";"         {
                        /* decimal reference; digits start behind "&#" */
                        num = parse_code_point(yytext + 2, 10);
                        out(num);
                      }

"&"[^;]{1,8}";"       {
                        /* Anything else that looks like an entity: an
                           unknown name or a bare ampersand followed by a
                           ';' a few characters later. */
                        if (option.QUIET != TRUE)
                          fprintf(stderr,
                                  "invalid entity or unescaped ampersand: %s\n",
                                  yytext);
                        if (option.Replace == TRUE)
                          out(option.rChar);
                        else if (option.replace == TRUE)
                          fputs(option.rString, yyout);
                        else if (option.FORCE == TRUE)
                          {}            /* skip "&" */
                        else
                          fputs("&", yyout);   /* keep "&", on the same
                                                  stream as all other output */
                        /* Only the "&" has been dealt with: push everything
                           behind it back so that it is scanned again. */
                        yyless(1);
                      }

[^&]+                 ECHO;
.                     ECHO;

%%

/* Print the usage message on stdout and terminate with status 1. */
void
help (void)
{
  fprintf(stdout,
          "\n"
          "htmlEnt2Char -- replaces HTML entities\n"
          " options:\n"
          "   -C <enc>     output encoding, actually supported:\n"
          "                   l1 lat1 latin1 iso-8859-1\n"
          "                   u8 utf-8 (default)\n"
          "   -o <file>    output filename\n"
          "   -f           force: skip misspelled entities or\n"
          "                entities not printable in given charset\n"
          "                (see also -r or -R)\n"
          "   -r <string>  replace unrecognized/unprintable entities\n"
          "                by <string>\n"
          "   -R <num>     replace unrecognized/unprintable entities\n"
          "                by a character given as <num>, a Unicode code point\n"
          "                Interpretation of <num> follows the C convention:\n"
          "                   0x....  for hexadecimal numbers\n"
          "                   0....   for octal numbers\n"
          "                   ....    for decimal numbers\n"
          "   -q           quiet: don't report errors, misspelled\n"
          "                entities etc.\n"
          "   -h | -?      print this help and exit\n"
          " Other arguments will be read as input filenames.\n"
          " If no input files are given, input is read from stdin.\n"
          " If no output file is given, the text with replacements\n"
          " is written to stdout.\n\n");
  printf("htmlEnt2Char, Sebastian Nagel (wastl@cis.uni-muenchen.de)\n");
  exit(1);
}

/* Map an encoding name given with -C to ISO_LATIN_1 or UTF8.
   Returns 0 for a name that is not recognized. */
static unsigned char
encoding_from_name (const char *name)
{
  static const char *const latin1_names[] = {
    "l1", "ISO-8859-1", "iso-8859-1", "lat1", "latin1", "latin-1", "Latin-1"
  };
  static const char *const utf8_names[] = { "u8", "utf-8", "UTF-8" };
  size_t i;

  for (i = 0; i < sizeof latin1_names / sizeof latin1_names[0]; i++)
    if (strcmp(name, latin1_names[i]) == 0)
      return ISO_LATIN_1;
  for (i = 0; i < sizeof utf8_names / sizeof utf8_names[0]; i++)
    if (strcmp(name, utf8_names[i]) == 0)
      return UTF8;
  return 0;
}

/* Parse the argument of -R as a code point (C notation: 0x.. hex,
   0.. octal, otherwise decimal).  Returns 1 and stores the value in
   *cp on success; returns 0 if the text is not a complete number or
   not a character that can be written. */
static int
parse_replacement_char (const char *text, unsigned int *cp)
{
  char *end;
  unsigned long value = strtoul(text, &end, 0);

  if (end == text || *end != '\0' || value > UNICODE_MAX)
    return 0;
  if (! is_writable_code_point((unsigned int) value))
    return 0;
  *cp = (unsigned int) value;
  return 1;
}

/* Parse the command line, then run the scanner over every input file
   named on it (or over stdin if there is none).
   Returns 0 on success; exits with status 1 on a usage or I/O error. */
int
main (int argc, char **argv)
{
  FILE *outfile = NULL;   /* file opened for -o, if any */
  int c;

  /* The leading ':' makes getopt return ':' for a missing argument
     instead of printing its own message. */
  while ((c = getopt(argc, argv, ":C:o:fqr:R:h?")) != -1)
    {
      switch (c)
        {
        case 'q':
          option.QUIET = TRUE;
          break;
        case 'C':
          {
            unsigned char enc = encoding_from_name(optarg);
            if (enc == 0)
              {
                fprintf(stderr, "Unknown encoding %s!\n", optarg);
                exit(1);
              }
            option.ENCODING = enc;
          }
          break;
        case 'o':
          if ((outfile = fopen(optarg, "w")) == NULL)
            {
              fprintf(stderr, "Can't open %s for writing!\n", optarg);
              exit(1);
            }
          yyout = outfile;
          break;
        case 'f':
          option.FORCE = TRUE;
          break;
        case 'r':
          option.replace = TRUE;
          option.rString = optarg;
          break;
        case 'R':
          if (! parse_replacement_char(optarg, &option.rChar))
            {
              fprintf(stderr, "Invalid replacement character %s!\n", optarg);
              exit(1);
            }
          option.Replace = TRUE;
          break;
        case ':':
          fprintf(stderr, "Option -%c requires an argument!\n", optopt);
          exit(1);
        case 'h':
        case '?':   /* explicit -? as well as any unknown option */
        default:
          help();   /* does not return */
          break;
        }
    }

  /* Checked after the loop because -C may follow -R on the command
     line.  0xff itself is a valid ISO-8859-1 character. */
  if (option.ENCODING == ISO_LATIN_1 && option.rChar > LATIN_1_MAX)
    {
      fprintf(stderr, "Replacement character not in iso-8859-1!\n");
      exit(1);
    }

  if (optind < argc)   /* remaining ARGVs are filenames */
    {
      for (; optind < argc; optind++)
        {
          FILE *infile = fopen(argv[optind], "r");
          if (infile == NULL)
            {
              fprintf(stderr, "Can't read from %s!\n", argv[optind]);
              perror(argv[optind]);
              exit(1);
            }
          yyin = infile;
          yyrestart(yyin);   /* discard the end-of-file state of the
                                previous input */
          yylex();
          fclose(infile);
        }
    }
  else   /* default: read yyin/stdin, when no input-files are given */
    {
      yylex();
    }

  /* Flush explicitly so that a failed write (full disk, closed pipe)
     is reported instead of being lost at exit. */
  if (outfile != NULL)
    {
      if (fclose(outfile) != 0)
        {
          perror("write error");
          return 1;
        }
    }
  else if (fflush(stdout) != 0)
    {
      perror("write error");
      return 1;
    }
  return 0;
}