/*****************************************************************/ /* Reading routines for MS-Word, MS-Write and text files */ /* */ /* This file is part of catdoc project */ /* (c) Victor Wagner 1996-2003, (c) Alex Ott 2003 */ /*****************************************************************/ #ifdef HAVE_CONFIG_H #include #endif #include #include #include "catdoc.h" unsigned short int buffer[PARAGRAPH_BUFFER]; static unsigned char read_buf[256]; static int buf_is_unicode; /**************************************************************************/ /* Just prints out content of input file. Called when file is not OLE */ /* stream */ /* Parameters - f - file to copy out. header - first few bytes of file, */ /* which have been already read by format recognition code, but should */ /* be output anyway */ /**************************************************************************/ void copy_out (FILE *f,char *header) { char *buf=(char *)buffer; int count,i; long offset; if (get_unicode_char == get_word8_char) { /* non-word file and -u specified. Trying to guess which kind of * unicode is used */ if ((unsigned char)header[0]==0xFE && (unsigned char)header[1]==0xFF) { get_unicode_char = get_utf16msb; fputs(convert_char(header[2]<<8|header[3]),stdout); fputs(convert_char(header[4]<<8|header[5]),stdout); fputs(convert_char(header[6]<<8|header[7]),stdout); } else if ((unsigned char)header[0]!=0xFF || (unsigned char)header[1]!=0xFE) { int c,j,d; /* if it is not utf16, assume it is UTF8. We are told -u, * aren't we */ get_unicode_char = get_utf8; i=0; while (i<8) { c=(unsigned char)header[i++]; if (c >=0x80) { if ( c<0xE0) { c=(c & 0x1F); count =1; } else { c=(c & 0xF); count = 2; } for (j=0;j0) { buffer[++bufptr]=0; output_paragraph(buffer); } } return 0; } /**********************************************************************/ /* Reads file from MS-Word 97 and above file. 
Takes into account the strange* * situation that Unicode and non-Unicode 256-byte blocks can be * * intermixed in a Word file * * * * Parameters: * * * * f - file to read * * offset - position of the character inside file (to determine * * possible block boundaries) * * fileend - file position used to clamp the number of bytes taken * * from the block just read * **********************************************************************/ int get_word8_char(FILE *f,long *offset,long fileend) { int count,i,u; char c; if ((i=(*offset)%256) ==0) { count=catdoc_read(read_buf,1,256,f); memset(read_buf+count,0,256-count); buf_is_unicode=0; if (*offset+(long)count>fileend) { count=fileend-*offset; } while (i