/*
 * Copyright (C) 2004-2005 Vadim Berezniker
 * http://www.kryptolus.com
 *
 * This Program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2, or (at your option)
 * any later version.
 *
 * This Program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with GNU Make; see the file COPYING. If not, write to
 * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
 * http://www.gnu.org/copyleft/gpl.html
 *
 */

#include "stdafx.h"
#include "common.h"
#include "kryTextFileReader.h"

/*
 * Creates a reader for the given file name. The file is not opened here; call Open().
 * The pointer is stored as-is (the string is not copied), so it has to stay valid
 * for as long as GetFilename() / Open() may be used.
 */
kryTextFileReader::kryTextFileReader(char *filename)
{
    this->m_filename = filename;

    // Start in a well-defined "closed" state. The destructor calls Close(), which
    // inspects m_fd and m_chunks, so they must not be left uninitialised when
    // Open() is never called or fails early.
    this->m_fd = -1;
    this->m_chunks = NULL;
    this->m_dataAvailable = 0;
    this->m_dataReturned = 0;
    this->m_file_size = 0;
    this->m_isFirstLine = TRUE;
    this->m_encoding = ENCODING_ASCII;
    this->m_isEOF = FALSE;
    this->m_line = 0;
}

/*
 * Closes the file and releases the buffered data.
 */
kryTextFileReader::~kryTextFileReader()
{
    this->Close();
}

/*
 * Returns the file name pointer that was given to the constructor.
 */
char *kryTextFileReader::GetFilename()
{
    return this->m_filename;
}

/*
 * Opens the file whose name was given to the constructor.
 * Returns TRUE on success, FALSE otherwise.
 */
gboolean kryTextFileReader::Open()
{
    return this->Open(this->GetFilename());
}

/*
 * Opens the given file for reading, resets the reader state and pre-fills the
 * internal buffer. Anything left over from a previous Open() is released first.
 * The stored file name (GetFilename()) is not changed by this call.
 *
 * Returns FALSE if filename is NULL, or if the file cannot be stat'ed or opened;
 * TRUE otherwise.
 */
gboolean kryTextFileReader::Open(char *filename)
{
    // drop the descriptor and chunks of an earlier Open() instead of leaking them
    this->Close();

    this->m_dataAvailable = 0;
    this->m_dataReturned = 0;
    this->m_isFirstLine = TRUE;
    this->m_chunks = NULL;
    this->m_encoding = ENCODING_ASCII;
    this->m_isEOF = FALSE;
    this->m_line = 0;

    if(filename == NULL)
        return FALSE;

    // the file size is the denominator used by GetProgress()
    struct _stat cfg;
    if(stat(filename, &cfg) < 0)
        return FALSE;

    this->m_file_size = cfg.st_size;

#ifdef _WINDOWS
    // binary mode: the reader does its own line-ending handling
    this->m_fd = open(filename, _O_RDONLY | _O_BINARY);
#else
    this->m_fd = open(filename, O_RDONLY);
#endif

    if(this->m_fd == -1)
        return FALSE;

    this->Fill();

    return TRUE;
}

/*
 * Closes the file descriptor (if open) and frees every buffered chunk.
 * Safe to call more than once.
 */
void kryTextFileReader::Close()
{
    if(this->m_fd != -1)
    {
        close(this->m_fd);
        this->m_fd = -1;
    }

    for(GList *ptr = this->m_chunks; ptr; ptr = ptr->next)
    {
        struct file_reader_chunk *chunk = (struct file_reader_chunk *) ptr->data;
        kry_free(chunk->data);
        kry_free(chunk);
    }

    // the list nodes themselves were allocated by g_list_append() and must be freed too
    g_list_free(this->m_chunks);
    this->m_chunks = NULL;
}

/*
 * Keeps reading from the input
file until the internal buffer is filed. */ void kryTextFileReader::Fill() { // keep reading until we have enough data, or we reach the end of the file while(this->m_dataAvailable < kryTextFileReader::FULL_SIZE && !this->m_isEOF) { char *buffer = (char *) kry_malloc(kryTextFileReader::CHUNK_SIZE); int rv = read(this->m_fd, buffer, kryTextFileReader::CHUNK_SIZE); // read successful, add it to the chunk list if(rv > 0) { struct file_reader_chunk *chunk = kry_new0(struct file_reader_chunk); // if the read chunk is maximum size, we don't need to allocate more memory if(rv == kryTextFileReader::CHUNK_SIZE) { chunk->data = (unsigned char *) buffer; } else { chunk->data = (unsigned char *) kry_malloc(rv); memcpy(chunk->data, buffer, rv); kry_free(buffer); } chunk->length = rv; if(this->m_isFirstLine && chunk->length >= 3 && chunk->data[0] == 0xEF && chunk->data[1] == 0xBB && chunk->data[2] == 0xBF) { unsigned char *tmp = (unsigned char *) kry_malloc(chunk->length + 1 - 3); memcpy(tmp, chunk->data + 3, chunk->length - 3); tmp[chunk->length - 3] = 0; kry_free(chunk->data); chunk->data = tmp; chunk->length -= 3; this->SetEncoding(ENCODING_UTF8); } if(this->m_isFirstLine && chunk->length >= 2 && chunk->data[0] == 0xFF && chunk->data[1] == 0xFE) { unsigned char *tmp = (unsigned char *) kry_malloc(chunk->length + 1 - 2); memcpy(tmp, chunk->data + 2, chunk->length - 2); tmp[chunk->length - 2] = 0; kry_free(chunk->data); chunk->data = tmp; chunk->length -= 2; this->SetEncoding(ENCODING_UTF16); } this->m_isFirstLine = FALSE; this->m_chunks = g_list_append(this->m_chunks, chunk); this->m_dataAvailable += rv; } else { kry_free(buffer); } #ifdef _WINDOWS if(rv == -1 || (rv < kryTextFileReader::CHUNK_SIZE && eof(this->m_fd))) #else if(rv == -1 || (rv < kryTextFileReader::CHUNK_SIZE)) #endif this->m_isEOF = TRUE; } } /* * Checks for a UTF16 newline sequence (13 0 10 0) at the beginning of the given UTF16 string. * Returns true if the first 4 characters match the UTF16 newline sequence. 
* If the string is less than 4 characters, returns true if those characters match the endline partially. */ gboolean kryTextFileReader::CheckUTF8LineEnding(unsigned char *str, int len, int offset) { char utf16endl[4] = {13, 0, 10, 0}; if(len > 4) len = 4; for(int i = 0; i < len - offset; i++) { if(str[i] != utf16endl[i + offset]) return FALSE; } return TRUE; } /* * Returns one line from buffer. * The returned string must be freed. */ char *kryTextFileReader::GetLine() { GList *line_chunks = NULL; GList *ptr; int i; int prev_chunk_partial_endl = 0; int endl_length = 0; gboolean newlineFound; for(ptr = this->m_chunks; ptr; ptr = ptr->next) { struct file_reader_chunk *chunk = (struct file_reader_chunk *) ptr->data; newlineFound = FALSE; for(i = 0; i < chunk->length; i++) { gboolean isUTF16endl = FALSE; gboolean isASCIIendl = FALSE; // we found a newline or we reached the end of the available data if(this->GetEncoding() == ENCODING_UTF16) { int remlen = chunk->length - i; if(remlen >= 4 && CheckUTF8LineEnding(chunk->data + i, remlen, prev_chunk_partial_endl)) { isUTF16endl = TRUE; endl_length = 4; } else if(remlen < 4 && CheckUTF8LineEnding(chunk->data + i, remlen, prev_chunk_partial_endl)) { prev_chunk_partial_endl += remlen; break; } else if(prev_chunk_partial_endl) { prev_chunk_partial_endl = 0; } } else { int remlen = chunk->length - i; if(remlen >= 1 && chunk->data[i] == 10) { isASCIIendl = TRUE; endl_length = 1 + prev_chunk_partial_endl; } else if(remlen >= 2 && (chunk->data[i] == 13 && chunk->data[i+1] == 10)) { isASCIIendl = TRUE; endl_length = 2; } else if(remlen == 1 && chunk->data[i] == 13) { prev_chunk_partial_endl = 1; } else { prev_chunk_partial_endl = 0; } } if(isUTF16endl || isASCIIendl || (this->m_isEOF && ptr->next == NULL && i == chunk->length - 1 && ++i)) { struct file_reader_chunk *chunk_new = kry_new0(struct file_reader_chunk); char *data_new; // creating a chunk containing only the part of the line before the end of the line // and add it to the 
chunk list for the current line chunk_new->length = i; chunk_new->data = (unsigned char *) kry_malloc(i + 1); memcpy(chunk_new->data, chunk->data, i); chunk_new->data[i] = 0; line_chunks = g_list_append(line_chunks, chunk_new); if(!this->m_isEOF || ptr->next != NULL || i != chunk->length) this->m_dataReturned += endl_length; // updates the current chunk to only contain the data after the endline // so that it can be read next time if(chunk->length == i + endl_length - prev_chunk_partial_endl) { kry_free(chunk->data); kry_free(chunk); this->m_chunks = this->m_chunks->next; } else { int offset = (i + endl_length - prev_chunk_partial_endl); data_new = (char *) kry_malloc(chunk->length - offset); memcpy(data_new, chunk->data + offset, chunk->length - offset); kry_free(chunk->data); chunk->data = (unsigned char *) data_new; chunk->length -= offset; } this->m_dataAvailable -= (i + endl_length); this->Fill(); newlineFound = TRUE; break; } } if(newlineFound) { unsigned char *line; int line_length = 0; int offset = 0; // calculate the combined length of the chunks comprising this line for(ptr = line_chunks; ptr; ptr = ptr->next) { struct file_reader_chunk *chunk = (struct file_reader_chunk *) ptr->data; line_length += chunk->length; } line_length -= prev_chunk_partial_endl; this->m_dataReturned += line_length; line = (unsigned char *) kry_malloc(line_length + (this->GetEncoding() == ENCODING_UTF16 ? 
2 : 1)); line[line_length] = 0; if(this->GetEncoding() == ENCODING_UTF16) line[line_length + 1] = 0; // fill up the line using the chunks for(ptr = line_chunks; ptr; ptr = ptr->next) { struct file_reader_chunk *chunk = (struct file_reader_chunk *) ptr->data; if(prev_chunk_partial_endl && ptr->next && ptr->next == g_list_last(line_chunks)) { memcpy(line + offset, chunk->data, chunk->length - prev_chunk_partial_endl); offset += (chunk->length - prev_chunk_partial_endl); } else { memcpy(line + offset, chunk->data, chunk->length); offset += chunk->length; } kry_free(chunk->data); kry_free(chunk); } this->m_line++; if(this->GetEncoding() == ENCODING_UTF16) { char *utf8str = KRY_TS(g_utf16_to_utf8((const gunichar2 *) line, -1, NULL, NULL, NULL)); kry_free(line); line = (unsigned char *) utf8str; } prev_chunk_partial_endl = 0; return (char *) line; } else { // endline not found, we add the current chunk to the chunks that will be used for the returned line line_chunks = g_list_append(line_chunks, chunk); // remove the current chunk from the list of chunks this->m_chunks = this->m_chunks->next; this->m_dataAvailable -= chunk->length; this->Fill(); // for loop uses this pointer to go to next item // since we modified m_chunks already, we must make ptr->next point to the address again ptr->next = this->m_chunks; } } return NULL; } enum file_encoding kryTextFileReader::GetEncoding() { return this->m_encoding; } void kryTextFileReader::SetEncoding(enum file_encoding encoding) { this->m_encoding = encoding; } double kryTextFileReader::GetProgress() { return ((double) this->m_dataReturned / this->m_file_size); } int kryTextFileReader::GetLineNumber() { return this->m_line; }