############################################################################
#
#	Name:	 bmp2text.icn
#
#	Title:	 convert a bitmap to a text-chunk
#
#	Author:	 Richard L. Goerwitz
#
#	Version: 2.6
#
############################################################################
#
#  This file contains bitmap_2_text(bitmap, filename).  Recall that
#  bitmaps are just a series of fixed-length bitfields used to mark
#  divisions within a text.  The procedure retrieve() locates words in
#  an index file, and returns a list of these bitmaps, which point to
#  divisions within the original text file - divisions within which a
#  given indexed word found by retrieve() occurs.  The procedure
#  bitmap_2_text() simply takes a given bitmap and finds the text
#  with which it is associated in the main text file.
#
#  Note that bitmap_2_text() does not seek directly to the correct
#  location within "filename" (arg 2).  It first breaks down the
#  bitmap into a less precise form via an offset table (read in via
#  the .OFS file), looks up the precise location of the bitmap in the
#  .UNT file, and then finally seeks up to that location in the main
#  text file, decodes the text it finds at that location, and then
#  returns the decoded section as a string.  The reason
#  bitmap_2_text() does this is that makeind (the indexing routine
#  which creates data files for retrieve() and bitmap_2_text()) does
#  not store the offset within the main text for every bitmap.  It
#  just saves the locations of major blocks in the .OFS file, then
#  keeps a full list on disk in the .UNT file.  This is basically just
#  a space-saving device.  It would eat up too much core memory to
#  keep a list of every offset for every chunk of text marked out by a
#  bitmap in filename.
#
#  Note also that, although retrieve() returns a list of bitmaps,
#  bitmap_2_text(bitmap, filename) expects a single bitmap as its first
#  argument.
#  It is better that text be retrieved as needed, one chunk
#  at a time, and not stuffed en masse into core memory as soon as it
#  is retrieve()'d.
#
############################################################################
#
#  Links: ./indexutl.icn, ./initfile.icn
#
#  See also: retrieve.icn, makeind.icn
#
############################################################################

# Declared in indexutl.icn.
# record is(FS, s_len, len, no, is_case_sensitive, hufftree)
# global IS

# Declared in initfile.icn.
# global filestats
# record Fs(ind_filename, bmp_filename, lim_filename, unt_filename,
#           IS, ofs_table)

#
# bitmap_2_text(bitmap, filename)
#
#   bitmap   - a single bitmap (integer) identifying a text division
#   filename - name of the main text file; must not be null
#
#   Returns whatever block_decode(in_main_file, IS.hufftree) produces
#   once the main text file has been positioned at the offset recorded
#   for bitmap in the .UNT file.
#
#   Fails if the offset table has no entry for the truncated bitmap, if
#   the scan of the .UNT file runs into entries whose upper fields no
#   longer match those of bitmap, or if end of file is reached first.
#
#   Calls abort() (defined elsewhere) with a message and numeric code
#   when filename is null, when either file cannot be opened, or when a
#   seek cannot be performed.
#
#   Side effects: assigns the global IS, and keeps the main text file
#   and .UNT file open in static variables between calls, reopening
#   them only when filename differs from the one used on the last call.
#
procedure bitmap_2_text(bitmap, filename)

    local cut_down_bitmap, upto_field, offset, line,
        base_value_mask, base_value, location, text_pos
    static old_main_filename, in_main_file, in_unt_file
    # global filestats, IS

    initial old_main_filename := ""

    # Check for sloppy programming.
    /filename & abort("bitmap_2_text","you called me without a filename",29)

    # ~==:= assigns filename to old_main_filename (and succeeds) only
    # when the two differ, so this body runs once per change of file.
    if old_main_filename ~==:= filename then {

        #
        # If necessary, initialize stats for the current file.
        #
        if /filestats | /filestats[filename]
        then initfile(filename)           # see initfile.icn

        # Close whatever was open for the previous file, then open the
        # full text file and its .UNT companion for reading.
        every close(\in_main_file | \in_unt_file)
        in_main_file := open(filename) |
            abort("bitmap_2_text", "can't open "||filename, 26)
        in_unt_file := open(filestats[filename].unt_filename) |
            abort("bitmap_2_text", "can't open .UNT file for "||filename, 27)
    }

    # Reset IS to current file.
    IS := filestats[filename].IS

    # Determine offset to seek to by using the bitmap->offset table
    # for the current file (arg 2).  The table is held in
    # filestats[filename].ofs_table and is keyed by the bitmap with its
    # low-order (IS.no - upto_field) fields shifted away.
    #
    # upto_field is two thirds of the field count, but never below 1.
    upto_field := 1 < (IS.no * 2) / 3 | 1
    cut_down_bitmap := ishift(bitmap, -(IS.no - upto_field) * IS.len)
    offset := \filestats[filename].ofs_table[cut_down_bitmap] | fail

    # Seek to offset, and begin looking for the string equiv. of
    # bitmap (arg 1).
    #
    seek(in_unt_file, offset) |
        abort("bitmap_2_text","can't seek to offset "||offset, 27)

    #
    # First figure out how to tell if we've gone too far.  Basically,
    # mask out the lower bits, and record the value of the upper bits.
    # (Original author's note: some fooling around is necessary because
    # bitmaps may use large ints, making it impossible to use icom()
    # in a naive manner.)  If the upper bits of the bitmaps being read
    # change, then we've gone too far.
    #
    base_value_mask := icom(2^((IS.no - upto_field) * IS.len) - 1)
    base_value := iand(bitmap, base_value_mask)

    #
    # Each .UNT line is read as <bitmap digits> TAB <offset in main
    # text file>.
    #
    while line := read(in_unt_file) do {
        line ? {
            # Only act on lines that really yield a bitmap.  A line with
            # no tab, or one digits_2_bitmap() (indexutl.icn) rejects,
            # is skipped rather than compared using an unset or stale
            # location.
            if location := digits_2_bitmap(tab(find("\t"))) then {
                if bitmap = location then {
                    move(1)                 # move past tab character
                    # Keep the raw offset text so that the error message
                    # shows what was actually read, even when it is not
                    # a valid integer.
                    text_pos := tab(0)
                    seek(in_main_file, integer(text_pos)) |
                        abort("bitmap_2_text","unable to seek to "||text_pos,28)
                    # block_decode() is in huffcode.icn; decodes the
                    # encoded verse and returns the result (should be
                    # an ASCII string)
                    return block_decode(in_main_file, IS.hufftree)
                }
                else if base_value ~= iand(location, base_value_mask)
                then fail                   # upper fields changed: gone too far
            }
        }
    }

    # we should have returned by now
    fail

end