/* dictfmt.c -- 
 * Created: Sun Jul 20 20:17:11 1997 by faith@acm.org
 * Revised: Sat Sep 27 23:47:04 2003 by faith@acm.org
 * Copyright 1997, 1998, 2003 Rickard E. Faith (faith@acm.org)
 * 
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the
 * Free Software Foundation; either version 1, or (at your option) any
 * later version.
 * 
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 * 
 * You should have received a copy of the GNU General Public License along
 * with this program; if not, write to the Free Software Foundation, Inc.,
 * 675 Mass Ave, Cambridge, MA 02139, USA.
 */

#include "dictP.h"
#include "str.h"

/* NOTE(review): the names of the system headers below were missing from
   the reviewed text (only bare "#include" tokens were left).  They were
   reconstructed from the identifiers this file uses; confirm the exact
   list against the upstream file.  <errno.h> is new and is needed by
   xatoi(). */
#include <maa.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <ctype.h>
#include <locale.h>
#include <errno.h>
#if HAVE_WCTYPE_H
#include <wctype.h>
#endif
#include <limits.h>
#if HAVE_GETOPT_H
#include <getopt.h>
#endif

#define FMT_MAXPOS 72
#define FMT_INDENT 0

/* Input format selectors stored in 'type'. */
#define JARGON    1
#define FOLDOC    2
#define EASTON    3
#define PERIODIC  4
#define HITCHCOCK 5
#define CIA1995   6
#define VERA      7
#define INDEXONLY 8

#define BSIZE 10240

static int  Debug;
static FILE *str;          /* the .dict output stream, NULL if none */

/* defaults to creating ASCII database */
static int utf8_mode            = 0;
static int bit8_mode            = 0;

static int index_keep_orig_mode = 0;
static int allchars_mode        = 0;
static int cs_mode              = 0;
static int quiet_mode           = 0;
static int dictfmt_ver_mode     = 1;
static int indexonly_base64     = 0;

static const char *hw_separator     = "";
static const char *idxdat_separator = "\034";
static int         without_hw       = 0;
static int         without_header   = 0;
static int         without_url      = 0;
static int         without_time     = 0;
static int         without_info     = 0;
static int         break_headwords  = 0;

static FILE *fmt_str;      /* pipe to the sort(1) process writing the index */
static int  fmt_indent;
static int  fmt_pos;
static int  fmt_pending;
static int  fmt_hwcount;
static int  fmt_maxpos = FMT_MAXPOS;
static int  fmt_ignore_headword = 0;

static int ignore_hw_url        = 0;
static int ignore_hw_shortname  = 0;
static int ignore_hw_info       = 0;
static int ignore_hw_def_strat  = 0;

static const char *locale           = NULL;
static const char *default_strategy = NULL;
static const char *mime_header      = NULL;

static str_Pool alphabet_pool = NULL;

static int type = 0;

/* Analog to wcswidth(3) for a multibyte string.
   Returns the summed column width of S, counting characters for which
   wcwidth__() reports -1 as one column, or -1 if S cannot be decoded. */
static int mbswidth_ (const char *s)
{
    int       ret = 0;
    wchar_t   wchar;
    int       width;
    size_t    len;
    mbstate_t ps;

    memset (&ps, 0, sizeof (ps));

    while (*s){
        len = mbrtowc__ (&wchar, s, MB_CUR_MAX__, &ps);
        switch (len){
        case (size_t) (-1):
        case (size_t) (-2):
            return -1;
        default:
            width = wcwidth__ (wchar);
            if (-1 == width)
                width = 1; /* we also count non-printable characters */

            ret += width;
        }
        s += len;
    }

    return ret;
}

/* Initialise libmaa with the program name FN and create the alphabet pool. */
static void init (const char *fn)
{
    maa_init (fn);
    alphabet_pool = str_pool_create ();
}

/* Prints "ARG: SYMBOL" to stdout; always returns 0. */
static int print_alphabet (const void *symbol, void *arg)
{
    printf ("%s: %s\n", (char *) arg, (const char *) symbol);
    return 0;
}

/* Release the alphabet pool.  The maa_shutdown () call is left disabled,
   as it was in the original code. */
static void destroy (void)
{
    str_pool_destroy (alphabet_pool);
    alphabet_pool = NULL;
}

/* Release resources and terminate the process with EXIT_STATUS. */
static void destroy_and_exit (int exit_status)
{
    destroy ();
    exit (exit_status);
}

/* Start the sort(1) subprocess that receives index lines through fmt_str.
   When FILENAME is non-NULL the sorted output is redirected to it,
   otherwise it goes to the subprocess's stdout.  Exits on popen failure.
   NOTE(review): FILENAME is pasted into a shell command unquoted, so names
   containing whitespace or shell metacharacters will not work as intended. */
static void fmt_openindex( const char *filename )
{
    char buffer [1024];

    if (bit8_mode || utf8_mode || allchars_mode)
        snprintf( buffer, sizeof (buffer), "sort -t '\t' -k 1,3" );
    else
        snprintf( buffer, sizeof (buffer), "sort -t '\t' -df -k 1,3" );

    if (filename){
        strlcat (buffer, "> ", sizeof (buffer));
        strlcat (buffer, filename, sizeof (buffer));
    }

    strlcat (buffer, "\n", sizeof (buffer));

    if (!(fmt_str = popen( buffer, "w" ))) {
        fprintf( stderr, "Cannot open %s for write\n", buffer );
        destroy_and_exit (1);
    }
}

/* Emit a newline plus fmt_indent spaces to the .dict stream and reset the
   wrapping state.  Does nothing without an output stream or while the
   current headword is being ignored. */
static void fmt_newline( void )
{
    int i;

    if (!str)
        return;
    if (fmt_ignore_headword)
        return;

    fputc('\n', str);
    for (i = 0; i < fmt_indent; i++)
        fputc(' ', str);

    fmt_pos     = 0;
    fmt_pending = 0;
}

/* Write one space-free token S to the .dict stream, breaking the line first
   if the token would cross fmt_maxpos.  An empty token stands for an extra
   space in the input and is written as a single space. */
static void fmt_wrap_and_print (const char *s)
{
    size_t len;
    int    print_space;

    if (utf8_mode){
        len = mbswidth_ (s);
        if (len == (size_t) -1)
            err_fatal (__FUNCTION__, "'%s' is not a valid utf-8 string\n", s);
    }else{
        len = strlen (s);
    }

    print_space = (fmt_pending || !len);

    if (fmt_pos
        && (size_t) fmt_pos + print_space + len > (size_t) fmt_maxpos)
    {
        fmt_newline();
    }

    if (fmt_pending || !len){
        fputc (' ', str);
        ++fmt_pos;
    }

    if (len > 0){
        fprintf (str, "%s", s);
        fmt_pos    += len;
        fmt_pending = 1;
    }
}

/* Split S at spaces and write the pieces, wrapped, to the .dict stream.
   Does nothing without an output stream or while the current headword is
   being ignored.  Exits if the working copy cannot be allocated. */
static void fmt_string( const char *s )
{
    char *sdup;
    char *p;
    char *pt;

    if (!str)
        return;

    assert (s);

    if (fmt_ignore_headword)
        return;

    sdup = malloc( strlen(s) + 1 );
    if (!sdup){
        perror ("malloc failed");
        destroy_and_exit (1);
    }
    strcpy( sdup, s );

    p = sdup;
    while ((pt = strchr(p, ' '))) {
        *pt = '\0';
        fmt_wrap_and_print (p);
        p = pt + 1;
    }

    if (*p)
        fmt_wrap_and_print (p);

    free(sdup);
}

#ifdef HAVE_UTF8
/*
  Reverses the bytes of the 8-bit string 's' in place.
  If length == -1 then s is a 0-terminated string.
*/
static void stranagram_8bit (char *s, int length)
{
    char *i = s;
    char *j;
    char  v;

    assert (s);

    if (length == -1)
        length = strlen (s);

    j = s + length - 1;

    while (i < j){
        v  = *i;
        *i = *j;
        *j = v;

        ++i;
        --j;
    }
}

/*
  Reverses the utf-8 string 's' in place, character by character: each
  multibyte character is byte-reversed first, then the whole buffer is
  byte-reversed, which restores the byte order inside every character.
  Returns non-zero on success, 0 otherwise.
*/
static int stranagram_utf8 (char *s)
{
    size_t    len;
    char     *p;
    mbstate_t ps;

    assert (s);

    memset (&ps, 0, sizeof (ps));

    for (p = s; *p; ){
        len = mbrlen__ (p, MB_CUR_MAX__, &ps);
        if ((int) len < 0)
            return 0; /* not a UTF-8 string */

        if (len > 1)
            stranagram_8bit (p, len);

        p += len;
    }

    stranagram_8bit (s, -1);
    return 1;
}
#endif

/* Remove spaces at the end of the string.
   In utf-8 mode the returned pointer may be advanced past the start of S,
   because the string is reversed, left-trimmed and reversed back. */
static char *trim_right (char *s)
{
    wchar_t mbc;
    int     len;

    if (!utf8_mode){
        len = strlen (s);

        while (len > 0 && isspace ((unsigned char) s [len - 1])){
            s [--len] = 0;
        }

        return s;
    }else{
#ifdef HAVE_UTF8
        if (!stranagram_utf8 (s))
            abort ();

        do {
            len = mbtowc__ (&mbc, s, MB_CUR_MAX__);
            assert (len >= 0);

            if (len == 0 || !iswspace__ (mbc))
                break;

            s += len;
        }while (1);

        if (!stranagram_utf8 (s))
            abort ();
#else
        abort();
#endif
        return s;
    }
}

/* Remove spaces at the beginning of the string (returns a pointer into S). */
static char *trim_left (char *s)
{
    wchar_t mbc;
    int     len;

    if (!utf8_mode){
        while (isspace((unsigned char) *s)){
            ++s;
        }

        return s;
    }else{
#ifdef HAVE_UTF8
        do {
            len = mbtowc__ (&mbc, s, MB_CUR_MAX__);
            assert (len >= 0);

            if (len == 0 || !iswspace__ (mbc))
                break;

            s += len;
        }while (1);
#else
        abort();
#endif
        return s;
    }
}

/* Remove spaces at the beginning and the end of the string */
static char *trim_lr (char *s)
{
    return trim_left (trim_right (s));
}

/* Non-zero if HW starts with "00-database" or "00database". */
static int is_headword_special (const char *hw)
{
    return (!strncmp (hw, "00-database", 11) ||
            !strncmp (hw, "00database", 10));
}

/* Count one more headword; unless quiet, report progress on stderr every
   100 headwords. */
static void increase_fmt_hwcount (void)
{
    if (!quiet_mode){
        if (fmt_hwcount && !(fmt_hwcount % 100)) {
            fprintf( stderr, "%10d headwords\r", fmt_hwcount );
        }
    }

    ++fmt_hwcount;
}

/* Write one index line for WORD to the sort pipe:
   normalised word, base64 START, base64 (END - START) and, optionally, a
   fourth column.  The fourth column is DATA or, with --index-keep-orig and
   no DATA, the original WORD when it differs from the normalised one; it
   is never written for 00-database* headwords.
   A NULL WORD is ignored; an empty WORD is counted but not written. */
static void write_hw_to_index (
    const char *word, const char *data,
    int start, int end)
{
    int   len      = 0;
    char *new_word = NULL;

    if (!word)
        return;

    increase_fmt_hwcount ();

    len = strlen (word);

    if (len > 0){
        new_word = malloc (len + 1);
        if (!new_word){
            perror ("malloc failed");
            destroy_and_exit (1);
        }

        if (tolower_alnumspace (
                word, new_word, allchars_mode, cs_mode, utf8_mode))
        {
            fprintf (stderr, "'%s' is not a UTF-8 string", word);
            destroy_and_exit (1);
        }

        fprintf( fmt_str, "%s\t%s\t", new_word, b64_encode(start) );
        fprintf( fmt_str, "%s", b64_encode(end-start) );

        if (!data && index_keep_orig_mode && strcmp (word, new_word)){
            data = word;
        }

        if (data && !is_headword_special (word)){
            fprintf( fmt_str, "\t%s\n", data);
        }else{
            fprintf( fmt_str, "\n");
        }

        free (new_word);
    }
}

/* Split WORD (modified in place) into headwords at hw_separator and write
   an index line for each.  If DATA is NULL and WORD contains
   idxdat_separator, the text after that separator becomes the fourth
   column.  00-database* headwords are never split.
   Returns a pointer to the last headword piece inside WORD. */
static char *split_and_write_hw_to_index (
    char *word, char *data,
    int start, int end)
{
    char  *p            = word;
    char  *sep          = NULL;
    char  *idx_data_sep = NULL;
    size_t sep_len      = 0;

    if (!data && idxdat_separator){
        idx_data_sep = strstr (word, idxdat_separator);
        if (idx_data_sep){
            sep_len = strlen (idxdat_separator);
            idx_data_sep [0] = 0;
            data = idx_data_sep + sep_len;
        }
    }

    do {
        sep = NULL;
        if (hw_separator [0] && !is_headword_special (word)){
            sep = strstr (p, hw_separator);
            if (sep)
                *sep = 0;
        }

        write_hw_to_index (trim_lr (p), data, start, end);

        if (!sep)
            break;

        p = sep + strlen (hw_separator);
    }while (1);

    return p;
}

/* Non-zero if WORD contains a byte for which isascii() is false.
   A NULL WORD yields 0. */
static int contain_nonascii_symbol (const char *word)
{
    if (!word)
        return 0;

    while (*word){
        if (!isascii ((unsigned char) *word))
            return 1;

        ++word;
    }

    return 0;
}

/* Record every character of the normalised form of WORD in alphabet_pool.
   NULL and 00-database* headwords are skipped.  Exits if WORD cannot be
   normalised, using the same message as write_hw_to_index (). */
static void update_alphabet (const char *word)
{
    char     *p;
    size_t    len = 0;
    mbstate_t ps;
    char      old_char;

    if (!word ||
        !strncmp (word, "00-database", 11) ||
        !strncmp (word, "00database", 10))
    {
        return;
    }

    len = strlen (word);
    p   = (char *) alloca (len + 1);

    /* Do not walk the buffer when normalisation failed: its content is
       not known to be a terminated string in that case. */
    if (tolower_alnumspace (word, p, allchars_mode, cs_mode, utf8_mode)){
        fprintf (stderr, "'%s' is not a UTF-8 string", word);
        destroy_and_exit (1);
    }

    memset (&ps, 0, sizeof (ps));

    while (*p){
        len = utf8_mode ? mbrlen__ (p, MB_CUR_MAX__, &ps) : 1;
        assert ((int) len >= 0);

        /* temporarily terminate after the current character */
        old_char = p [len];
        p [len]  = 0;

        str_pool_find (alphabet_pool, p);

        p [len] = old_char;
        p += len;
    }
}

/* Handle special headwords that may be written only once.
   The first occurrence of each is accepted and remembered; later ones set
   fmt_ignore_headword so that their text is dropped.
   Returns 1 if word should be skipped, 0 otherwise. */
static int fmt_newheadword_special (const char *word)
{
    if (
        word &&
        (!strcmp (word, "00-database-default-strategy") ||
         !strcmp (word, "00databasedefaultstrategy")))
    {
        if (ignore_hw_def_strat){
            fmt_ignore_headword = 1;
            return 1;
        }

        /* ignore all following occurrences of 00-database-default-strategy */
        ignore_hw_def_strat = 1;
    }

    if (
        word &&
        (!strcmp (word, "00-database-url") ||
         !strcmp (word, "00databaseurl")))
    {
        if (ignore_hw_url){
            fmt_ignore_headword = 1;
            return 1;
        }

        /* ignore all following occurrences of 00-database-url */
        ignore_hw_url = 1;
    }

    if (
        word &&
        (!strcmp (word, "00-database-short") ||
         !strcmp (word, "00databaseshort")))
    {
        if (ignore_hw_shortname){
            fmt_ignore_headword = 1;
            return 1;
        }

        /* ignore all following occurrences of 00-database-short */
        ignore_hw_shortname = 1;
    }

    if (
        word &&
        (!strcmp (word, "00-database-info") ||
         !strcmp (word, "00databaseinfo")))
    {
        if (ignore_hw_info){
            fmt_ignore_headword = 1;
            return 1;
        }

        /* ignore all following occurrences of 00-database-info */
        ignore_hw_info = 1;
    }

    return 0;
}

/* Exit with an error if WORD has non-ASCII bytes while neither 8-bit nor
   utf-8 mode is active. */
static void fmt_test_nonascii (const char *word)
{
    if (!bit8_mode && !utf8_mode){
        if (contain_nonascii_symbol (word)){
            fprintf (stderr, "\n8-bit head word \"%s\"is encountered while \"C\" locale is used\n", word);
            destroy_and_exit (1);
        }
    }
}

/* Start a new entry.
   The index line(s) of the PREVIOUS headword are written now, because only
   at this point is the end offset of its definition known.  WORD is then
   remembered as the pending headword and, unless suppressed, written to the
   .dict stream.  WORD == NULL only flushes the pending headword. */
static void fmt_newheadword( const char *word )
{
    static char prev[1024] = "";
    static int  start = 0;
    static int  end;
    char       *sep = NULL;
    char       *p;

    if (fmt_newheadword_special (word))
        return;

    update_alphabet (word);

    fmt_ignore_headword = 0;

    fmt_test_nonascii (word);

    fmt_indent = 0;
    fflush(stdout);

    /* There is no .dict stream in -D mode; ftell(NULL) would be invalid. */
    if (str)
        end = ftell(str);

    if (fmt_str && *prev) {
        split_and_write_hw_to_index (prev, NULL, start, end);
    }

    if (word) {
        strlcpy(prev, word, sizeof (prev));
        start = end;
    }

    if (
        word && !without_hw &&
        strncmp (word, "00-database", 11) &&
        strncmp (word, "00database", 10))
    {
        p = prev;

        if (hw_separator[0] && break_headwords)
            while ((sep = strstr (p, hw_separator))) {
                *sep = 0;
                fmt_string (p);
                fmt_newline();
                /* only the first separator byte was overwritten */
                *sep = hw_separator[0];
                p = sep + strlen (hw_separator);
            }

        fmt_string (p);
        fmt_newline();
    }
}

/* Flush the last pending headword (unless in index-only mode), close the
   sort pipe and print the final headword count unless quiet. */
static void fmt_closeindex( void )
{
    if (type != INDEXONLY){
        fmt_newheadword (NULL);
    }

    if (fmt_str){
        pclose( fmt_str );
    }

    if (!quiet_mode){
        fprintf( stderr, "%12d headwords\n", fmt_hwcount );
    }
}

/* Print program name, version and copyright lines to OUT_STREAM. */
static void banner( FILE *out_stream )
{
    fprintf( out_stream, "dictfmt-%s\n", DICT_VERSION );
    fprintf( out_stream,
             "Copyright 1997-2000 Rickard E. Faith (faith@cs.unc.edu)\n"
             "Copyright 2002-2007 Aleksey Cheusov (vle@gmx.net)\n"
             "\n");
}

/* Print the banner followed by the license text to stdout. */
static void license( void )
{
    static const char *license_msg[] = {
        "This program is free software; you can redistribute it and/or modify it",
        "under the terms of the GNU General Public License as published by the",
        "Free Software Foundation; either version 1, or (at your option) any",
        "later version.",
        "",
        "This program is distributed in the hope that it will be useful, but",
        "WITHOUT ANY WARRANTY; without even the implied warranty of",
        "MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU",
        "General Public License for more details.",
        "",
        "You should have received a copy of the GNU General Public License along",
        "with this program; if not, write to the Free Software Foundation, Inc.,",
        "675 Mass Ave, Cambridge, MA 02139, USA.",
        0 };
    const char **p = license_msg;

    banner ( stdout );
    while (*p)
        fprintf( stdout, " %s\n", *p++ );
}

/* Print the banner followed by the usage text to OUT_STREAM.
   NOTE(review): the messages are reproduced as they appear in the reviewed
   text, where runs of blanks inside the literals had been collapsed to a
   single space; the column padding of the original help text could not be
   recovered and should be checked against the upstream file. */
static void help( FILE *out_stream )
{
    static const char *help_msg[] = {
        "Usage: dictfmt -c5|-t|-e|-f|-h|-j|-p [-u url] [-s name] [options] basename",
        " dictfmt -i|-I [options]",
        "Create a dictionary databse and index file for use by a dictd server",
        "",
        "-c5 headwords are preceded by a line containing at least \n"
        " 5 underscore (_) characters",
        "-t implies -c5, --without-headword and --without-info options",
        "-e file is in html format",
        "-f headwords start in col 0, definitions start in col 8",
        "-j headwords are set off by colons",
        "-p headwords are preceded by %p, with %d on following line",
        "-i reformat stdin having three-column .index file format",
        "-u URL of site where database was obtained",
        "-s name of the database",
        "--license\n"
        " -L display copyright and license information",
        "--version\n"
        " -V display version information",
        "-D debug",
        "--utf8 for creating utf-8 dictionary",
        "--quiet\n"
        " --silent\n"
        " -q quiet operation",
        "--help display this help message",
        "--locale specifies the locale used for sorting.\n"
        " if no locale is specified, the \"C\" locale is used.",
        "--allchars all characters (not only alphanumeric and space)\n"
        " will be used in search if this argument is supplied",
        "--headword-separator sets headword separator which allows\n"
        " several words to have the same definition\n"
        " Example: autumn%%%fall can be used\n"
        " if '--headword-separator %%%' is supplied",
        "--index-data-separator sets index/data separator which allows\n"
        " to explicitely set fourth column in .index file,\n"
        " the default is \"\\034\"",
        "--break-headwords multiple headwords will be written on separate lines\n"
        " in the .dict file. For use with '--headword-separator.",
        "--index-keep-orig fourth column in .index file stores original headword\n"
        " which is returned by MATCH command",
        "--case-sensitive Create .index/.dict files for case sensitive search",
        "--without-headword headwords will not be copied to .dict file",
        "--without-header header will not be copied to DB info entry",
        "--without-url URL will not be copied to DB info entry",
        "--without-time time of creation will not be copied to DB info entry",
        "--without-info DB info entry will not be created.\n"
        " This may be useful if 00-database-info headword\n"
        " is expected from stdin (dictunformat outputs it).",
        "--columns Set the number of columns for wrapping text\n"
        " before writing it to .dict file.\n"
        " If it is zero, wrapping is off.",
        "--default-strategy Sets the default search strategy for the database.\n"
        " Special entry 00-database-default-strategy is created\n"
        " for this purpose.",
        "--mime-header Sets MIME header stored in .data file which\n"
        " prepend definition\n"
        " when client sent OPTION MIME to `dictd'",
        "--without-ver do not create 00-database-dictfmt- entry in .index",
        0 };
    const char **p = help_msg;

    banner( out_stream );
    while (*p)
        fprintf( out_stream, "%s\n", *p++ );
}

/* Activate LOCALE_ for collation and character classification and derive
   utf8_mode / bit8_mode from its codeset.  Exits with status 2 if the
   locale cannot be set. */
static void set_utf8bit_mode (const char *locale_)
{
    const char *charset = NULL;
    int         ascii_mode;

    if (!setlocale(LC_COLLATE, locale_) || !setlocale(LC_CTYPE, locale_)){
        fprintf (stderr, "invalid locale '%s'\n", locale_);
        destroy_and_exit (2);
    }

    charset = nl_langinfo (CODESET);

    utf8_mode = !strcmp (charset, "UTF-8");

#if !HAVE_UTF8
    if (utf8_mode){
        err_fatal (
            __FUNCTION__,
            "utf-8 support was disabled at compile time\n");
    }
#endif

    ascii_mode =
        !strcmp (charset, "ANSI_X3.4-1968") ||
        !strcmp (charset, "US-ASCII") ||
        (locale_ [0] == 'C' && locale_ [1] == 0);

    bit8_mode = !ascii_mode && !utf8_mode;

#ifndef SYSTEM_UTF8_FUNCS
    if (utf8_mode){
        fprintf (stderr, "Using --locale xx_YY.UTF-8 for creating utf-8 database is deprecated,\n"
                 " use --utf8 option instead.\n");
    }
#endif
}

/* Sentinel: url/sname still point here when -u/-s were not given. */
static const char string_unknown [] = "unknown";
static const char *url   = string_unknown;
static const char *sname = string_unknown;

/* Write the 00-database-default-strategy entry if a strategy was given. */
static void fmt_headword_for_def_strat (void)
{
    if (!default_strategy)
        return;

    fmt_newheadword ("00-database-default-strategy");
    fmt_string (default_strategy);
    fmt_newline ();
}

/* Write the 00-database-mime-header entry, unwrapped, if a header was
   given. */
static void fmt_headword_for_MIME_header (void)
{
    int old_max_pos = fmt_maxpos;

    if (!mime_header)
        return;

    fmt_maxpos = INT_MAX; /* no wrap for this special headword */

    fmt_newheadword ("00-database-mime-header");
    fmt_string (mime_header);
    fmt_newline ();

    fmt_maxpos = old_max_pos; /* restore */
}

/* Write the 00-database-url entry and suppress later ones. */
static void fmt_headword_for_url (void)
{
    fmt_newheadword ("00-database-url");
    fmt_string (url);
    fmt_newline ();

    ignore_hw_url = 1;
}

/* Write the 00-database-alphabet entry: the concatenation of all keys
   collected in alphabet_pool. */
static void fmt_headword_for_alphabet (void)
{
    const char  *key;
    size_t       sum_size = 0;
    str_Position pos;
    char        *alphabet;

    fmt_newheadword("00-database-alphabet");

    STR_ITERATE (alphabet_pool, pos, key){
        sum_size += strlen (key);
    }

    alphabet = xmalloc (sum_size + 1);
    alphabet [0] = 0;

    STR_ITERATE (alphabet_pool, pos, key){
        strcat (alphabet, key);
    }

    fmt_string (alphabet);
    xfree (alphabet);

    fmt_newline ();
}

/* Write the 00-database-short entry and suppress later ones. */
static void fmt_headword_for_shortname (void)
{
    fmt_newheadword("00-database-short");
    fmt_string ("00-database-short");
    fmt_newline ();
    fmt_string( " " );
    fmt_string( sname );
    fmt_newline ();

    ignore_hw_shortname = 1;
}

/* Write the 00-database-info entry: conversion time, source URL and the
   redistribution notice, each unless disabled by a --without-* option. */
static void fmt_headword_for_info (void)
{
    time_t t;
    char   buffer[BSIZE];

    fmt_newheadword("00-database-info");

    if (!without_time){
        fmt_string("This file was converted from the original database on:" );
        fmt_newline();
        time(&t);
        snprintf( buffer, sizeof (buffer), " %25.25s", ctime(&t) );
        buffer [strlen (buffer) - 1] = 0; /* for removing \n */
        fmt_string( buffer );
        fmt_newline();
        fmt_newline();
    }

    if (!without_url){
        fmt_string( "The original data is available from:" );
        fmt_newline();
        fmt_string( " " );
        fmt_string( url );
        fmt_newline();
        fmt_newline();
    }

    if (!without_header){
        static const char msg [] =
            "The original data was distributed with the notice shown below."
            " No additional restrictions are claimed. Please redistribute"
            " this changed version under the same conditions and restriction"
            " that apply to the original version.";

        if (fmt_maxpos == INT_MAX){
            /* --columns 0: still wrap this notice at the default width */
            fmt_maxpos = FMT_MAXPOS;
            fmt_string(msg);
            fmt_maxpos = INT_MAX;
        }else{
            fmt_string(msg);
        }
        fmt_newline();
        fmt_newline();
    }
}

/* Write the 00-database-utf8 marker entry in utf-8 mode. */
static void fmt_headword_for_utf8 (void)
{
    if (utf8_mode){
        fmt_newheadword("00-database-utf8");
        fmt_newline();
    }
}

/* Write the 00-database-8bit-new marker entry in 8-bit mode. */
static void fmt_headword_for_8bit (void)
{
    if (bit8_mode){
        fmt_newheadword("00-database-8bit-new");
        fmt_newline();
    }
}

/* Write the 00-database-allchars marker entry with --allchars. */
static void fmt_headword_for_allchars (void)
{
    if (allchars_mode){
        fmt_newheadword("00-database-allchars");
        fmt_newline();
    }
}

/* Write the 00-database-case-sensitive marker entry with --case-sensitive. */
static void fmt_headword_for_casesensitive (void)
{
    if (cs_mode){
        fmt_newheadword("00-database-case-sensitive");
        fmt_newline();
    }
}

/* Write the 00-database-dictfmt-<version> entry unless --without-ver. */
static void fmt_headword_for_dictfmt_ver (void)
{
    char ver [200];

    snprintf (ver, sizeof (ver), "00-database-dictfmt-%s", DICT_VERSION);

    if (dictfmt_ver_mode){
        fmt_newheadword (ver);
        fmt_string (ver);
        fmt_newline ();
    }
}

/* Write the predefined entries that go before the input is read. */
static void fmt_predefined_headwords_before (void)
{
    if (type == INDEXONLY)
        return;

    fmt_headword_for_utf8 ();
    fmt_headword_for_8bit ();
    fmt_headword_for_allchars ();
    fmt_headword_for_casesensitive ();
    fmt_headword_for_def_strat ();
    fmt_headword_for_MIME_header ();
    fmt_headword_for_dictfmt_ver ();

    if (url != string_unknown){
        /* -u option was given: add the 00-database-url headword now so
           that a later one coming from the input is ignored */
        fmt_headword_for_url ();
    }
    if (sname != string_unknown){
        /* -s option was given: add the 00-database-short headword now so
           that a later one coming from the input is ignored */
        fmt_headword_for_shortname ();
    }

    if (!without_info){
        fmt_headword_for_info ();
    }
}

/* Write the predefined entries that go after the input is read.  The url
   and short-name entries are dropped by fmt_newheadword_special () when
   such a headword has already been seen. */
static void fmt_predefined_headwords_after (void)
{
    if (type == INDEXONLY)
        return;

    fmt_headword_for_url ();
    fmt_headword_for_shortname ();
    fmt_headword_for_alphabet ();
}

/* Convert the decimal string NPTR to int.
   Calls err_fatal () if NPTR is empty, has trailing characters or does not
   fit into an int. */
static int xatoi (const char *nptr)
{
    char *end;
    long  ret;

    errno = 0;
    ret   = strtol (nptr, &end, 10);

    if (end == nptr || end [0] != 0
        || errno == ERANGE || ret < INT_MIN || ret > INT_MAX)
    {
        err_fatal (__FUNCTION__, "bad decimal '%s'\n", nptr);
    }

    return (int) ret;
}

/* Remove one trailing '\n' from LINE, if there is one. */
static void chomp (char *line)
{
    size_t len = strlen (line);

    if (len > 0 && line [len - 1] == '\n')
        line [len - 1] = '\0';
}

/* Read a dictionary in one of the supported formats from stdin and create
   BASENAME.dict and BASENAME.index; with -i/-I only reformat an index read
   from stdin.  Returns 0 on success; errors terminate the process. */
int main( int argc, char **argv )
{
    int         c;
    int         n;
    char        buffer[BSIZE];
    char        buffer2[BSIZE];
    char        indexname[1024]="/nonexistentfile.index";
    char        dataname[1024]="/nonexistentfile.data";
    int         header = 0;
    char       *pt;
    char       *s, *d;
    char       *buf;
    const char *basename = NULL;

    struct option longopts[] = {
        { "help",                 0, 0, 501 },
        { "locale",               1, 0, 502 },
        { "allchars",             0, 0, 503 },
        { "headword-separator",   1, 0, 504 },
        { "without-headword",     0, 0, 505 },
        { "without-header",       0, 0, 506 },
        { "without-url",          0, 0, 507 },
        { "without-time",         0, 0, 508 },
        { "without-info",         0, 0, 509 },
        { "columns",              1, 0, 510 },
        { "break-headwords",      0, 0, 511 },
        { "quiet",                0, 0, 'q' },
        { "silent",               0, 0, 'q' },
        { "version",              0, 0, 'V' },
        { "license",              0, 0, 'L' },
        { "default-strategy",     1, 0, 512 },
        { "mime-header",          1, 0, 513 },
        { "utf8",                 0, 0, 514 },
        { "index-keep-orig",      0, 0, 515 },
        { "case-sensitive",       0, 0, 516 },
        { "index-data-separator", 1, 0, 517 },
        { "without-ver",          0, 0, 518 },
        { NULL,                   0, 0, 0   },
    };

    init (argv[0]);

    while ((c = getopt_long( argc, argv, "qVLjvfepiIhDu:s:c:t",
                             longopts, NULL )) != -1)
        switch (c) {
        case 'q': quiet_mode = 1;            break;
        case 'L': license(); destroy_and_exit (1);     break;
        case 'V': banner( stdout ); destroy_and_exit (1); break;
        case 501: help( stdout ); destroy_and_exit (1);   break;
        case 'j': type = JARGON;             break;
        case 'f': type = FOLDOC;             break;
        case 'e': type = EASTON;             break;
        case 'p': type = PERIODIC;           break;
        case 'i':
            type = INDEXONLY;
            break;
        case 'I':
            type = INDEXONLY;
            indexonly_base64 = 1;
            break;
        case 'h':
            type = HITCHCOCK;
            without_hw = 1;
            break;
        case 'v': type = VERA;               break;
        case 'D': ++Debug;                   break;
        case 'u': url = optarg;              break;
        case 's': sname = optarg;            break;
        case 'c':
            switch (*optarg) {
            case '5': type = CIA1995; break;
            default:
                fprintf( stderr,
                         "Only CIA 1995 (-c5) currently supported\n" );
                destroy_and_exit (1);
            }
            break;
        case 502: locale = optarg;           break;
        case 503: allchars_mode = 1;         break;
        case 504: hw_separator = optarg;     break;
        case 505: without_hw = 1;            break;
        case 506: without_header = 1;        break;
        case 507: without_url = 1;           break;
        case 508: without_time = 1;          break;
        case 509: without_info = 1;          break;
        case 510:
            fmt_maxpos = atoi (optarg);
            if (fmt_maxpos <= 0){
                fmt_maxpos = INT_MAX;   /* zero or less: wrapping is off */
            }
            break;
        case 511: break_headwords = 1;       break;
        case 512:
            default_strategy = str_copy (optarg);
            break;
        case 513:
            mime_header = str_copy (optarg);
            break;
        case 514:
            bit8_mode = 0;
            utf8_mode = 1;
            break;
        case 515: index_keep_orig_mode = 1;  break;
        case 516: cs_mode = 1;               break;
        case 517: idxdat_separator = optarg; break;
        case 518: dictfmt_ver_mode = 0;      break;
        case 't':
            without_info = 1;
            without_hw   = 1;
            type         = CIA1995;
            fmt_maxpos   = INT_MAX;
            break;
        default:
            help (stderr);
            destroy_and_exit (1);
        }

    if (type != INDEXONLY) {
        if (optind + 1 != argc) {
            help (stderr);
            destroy_and_exit (1);
        }

        basename = argv [optind];
    }

    if (locale)
        set_utf8bit_mode (locale);

    setenv("LC_ALL", "C", 1); /* this is for 'sort' subprocess */

    if (!basename){
        fmt_openindex (NULL);
    }else{
        /* snprintf returns the length it WANTED to write, so truncation
           shows up as a result >= the buffer size, not as -1 */
        n = snprintf (indexname, sizeof (indexname), "%s.index", basename);
        if (n < 0 || (size_t) n >= sizeof (indexname))
            err_fatal (__FUNCTION__, "Too long filename\n");

        n = snprintf (dataname, sizeof (dataname), "%s.dict", basename);
        if (n < 0 || (size_t) n >= sizeof (dataname))
            err_fatal (__FUNCTION__, "Too long filename\n");

        fmt_openindex( indexname );
    }

    if (basename && !Debug){
        if (!(str = fopen (dataname, "w"))) {
            fprintf(stderr, "Cannot open %s for write\n", dataname);
            destroy_and_exit (1);
        }
    }

    fmt_predefined_headwords_before ();

    /* At most BSIZE-3 characters are read per chunk so that the FOLDOC
       tab expansion below (which grows the text by two bytes) always fits
       into buffer, even for a chunk that has no trailing newline. */
    while (fgets(buf = buffer, BSIZE - 2, stdin)) {
        chomp (buffer);

        switch (type) {
        case HITCHCOCK:
            if (strlen(buffer) == 1) {
                header = 1;
                continue;
            }
            if (header) {
                strcpy( buffer2, buffer );
                if ((pt = strchr( buffer2, ','))) {
                    *pt = '\0';
                    fmt_newheadword(buffer2);
                    *pt = ',';
                }
            }
            break;

        case EASTON:
            /* translate/strip HTML tags from buffer2 back into buffer */
            strcpy( buffer2, buffer );
            for (s = buffer2, d = buffer; *s; ++s) {
                if (*s == '<') {
                    header = 1;
                    switch (s[1]) {
                    case 'I': *d++ = '_'; break;
                    case 'A':
                        if (s[3] == 'N')
                            goto skip;
                        *d++ = '{';
                        break;
                    case 'P': goto skip;
                    case 'B': goto copy;
                    case '/':
                        switch(s[2]) {
                        case 'I': *d++ = '_'; break;
                        case 'A': *d++ = '}'; break;
                        case 'B': goto copy;
                        default:
                            fprintf( stderr,
                                     "Unknown tag: %s (%c%c)\n",
                                     buffer2, s[1], s[2] );
                            destroy_and_exit (1);
                        }
                        break;
                    default:
                        fprintf( stderr, "Unknown tag: %s (%c)\n",
                                 buffer2, s[1] );
                        destroy_and_exit (1);
                    }
                    while (*s && *s != '>')
                        s++;
                    if (!*s)
                        break; /* tag without '>': do not step past the end */
                    continue;
                }
            copy:
                *d++ = *s;
            }
            *d = '\0';

            if (*buffer == '<') {
                switch (buffer[1]) {
                case 'B':
                    if ((pt = strstr( buffer+3, " - " ))) {
                        *pt = '\0';
                        fmt_newheadword(buffer+3);
                        continue;
                    } else {
                        fprintf( stderr, "No end: %s\n", buffer );
                        destroy_and_exit (1);
                    }
                    break;
                default:
                    fprintf( stderr, "Unknown: %s\n", buffer );
                    destroy_and_exit (1);
                }
            } else {
                if (buffer[0] == ' ' && buffer[1] == ' ')
                    fmt_newline();
            }
            break;

        case JARGON:
            switch (*buffer) {
            case ':':
                header = 1;
                if ((pt = strchr( buffer+1, ':' ))) {
                    s = pt + 1;
                    if (*s == ':')
                        ++s;
                    *pt = '\0';
                    fmt_newheadword (buffer+1);
                    memmove( buf, s, strlen(s)+1 ); /* move \0 also */
                }
                break;
            case '*':
            case '=':
            case '-':
                if (buffer[0] == buffer[1]
                    && buffer[0] == buffer[2]
                    && buffer[0] == buffer[3])
                    continue; /* Skip lines with *'s and ='s */
            }
            break;

        case PERIODIC:
            switch (*buffer) {
            case '%':
                if (buffer[1] == 'h') {
                    if (!header) {
                        header = 1;
                        continue;
                    } else {
                        fmt_newheadword(buffer+3);
                        continue;
                    }
                } else if (buffer[1] == 'd') {
                    continue;
                }
                break;
            }
            break;

        case VERA:
            switch (*buffer) {
            case '@':
                if (header && !strncmp(buffer, "@item ", 6)) {
                    fmt_newheadword(buffer+6);
                }
                continue;
            }
            if (!header) {
                fmt_string("This is a special GNU edition of V.E.R.A.,");
                fmt_string("a list dealing with computational acronyms.");
                fmt_newline();
                fmt_string("Copyright 1993/2002 Oliver Heidelbach ");
                fmt_newline();
                fmt_newline();
                fmt_string(
                    "Permission is granted to copy, distribute and/or modify this document"
                    " under the terms of the GNU Free Documentation License, Version 1.1"
                    " or any later version published by the Free Software Foundation;"
                    " with no Invariant Sections, with no Front-Cover Texts, and with "
                    " no Back-Cover Texts.");
                fmt_newline();
                fmt_newline();
                fmt_string(
                    "Within the above restrictions the distribution of this"
                    " document is explicitly encouraged and I hope you'll find"
                    " it of some value.");
                fmt_newline();
                fmt_newline();
                fmt_string(
                    "This dictionary has nothing to do with Systems Science Inc. "
                    "or its products.");
                fmt_newline();
                ++header;
            }
            break;

        case FOLDOC:
            if (*buffer && *buffer != ' ' && *buffer != '\t') {
                ++header;
                if (header >= 3) {
                    fmt_newheadword(buffer);
                    continue;
                }
            }

            if (header < 3 && without_info)
                continue;

            if (*buf == ' '){
                ++buf;
            }

            if (*buf == '\t') {
                /* replace the tab by three spaces */
                memmove( buf+2, buf, strlen(buf)+1 ); /* move \0 */
                buf[0] = buf[1] = buf[2] = ' ';
            }
            break;

        case CIA1995:
            if (*buffer == '@') {
                buf++;
            } else if (strncmp(buffer, "_____",5) == 0) {
                /* skip the empty line, then read the headword line; at
                   end of input the stale separator must not be reused */
                if (!fgets(buf = buffer, BSIZE - 2, stdin)
                    || !fgets(buf = buffer, BSIZE - 2, stdin))
                {
                    continue;
                }
                chomp (buffer);

                buf = trim_left (buf);
                if (*buf != '\0') {
                    fmt_indent = 0;
                    fmt_newheadword (buf);
                    continue;
                }
            }
            break;

        case INDEXONLY:
        {
            const char *headword = NULL;
            const char *offset   = NULL;
            const char *size     = NULL;
            const char *data     = NULL;
            int         i_offset = 0;
            int         i_size   = 0;

            headword = strtok (buffer, "\t");
            if (!headword){
                fprintf (stderr, "strtok failed 1\n");
                destroy_and_exit (1);
            }

            offset = strtok (NULL, "\t");
            if (!offset){
                fprintf (stderr, "strtok failed 2\n");
                destroy_and_exit (1);
            }

            size = strtok (NULL, "\t\n");
            if (!size){
                fprintf (stderr, "strtok failed 3\n");
                destroy_and_exit (1);
            }

            data = strtok (NULL, "\n");

            if (indexonly_base64){
                i_offset = (int) b64_decode (offset);
                i_size   = (int) b64_decode (size);
            }else{
                i_offset = xatoi (offset);
                i_size   = xatoi (size);
            }

            write_hw_to_index (headword, data, i_offset, i_offset + i_size);
        }
        break;

        default:
            fprintf(stderr, "Unknown input format type %d\n", type );
            destroy_and_exit (2);
        }

        if (buf){
            fmt_string(buf);
            fmt_indent = 0;
            fmt_newline();
        }
    skip:;
    }

    fmt_predefined_headwords_after ();

    fmt_closeindex();

    if (str)
        fclose(str);

    destroy ();
    return 0;
}