############################################################################ # # Name: retrieve.icn # # Title: retrieve locations of words in database file # # Author: Richard L. Goerwitz # # Version: 1.25 # ############################################################################ # # Retrieve(pattern, filename) retrieves all locations containing # words matching pattern (arg1) in filename (arg2), placing them in a # list. "Locations" are integer-coded pointers to places in filename # where corresponding text is located. To actually retrieve that # block of text, you must call bitmap_2_text(location, filename). # Retrieve() only gathers up a list of locations in filename # containing words which match pattern. # # The reason retrieve() doesn't do the logical thing - namely, to # "retrieve" text itself - is that doing so might use a *lot* of # memory. It is far more economical to retrieve text only when a # given chunk is requested via bitmap_2_text(). # # Note: Patterns must match words in their entirety. For instance, # retrieve("dog",filename) would only retrieve exact matches for the # word "dog" in filename. To catch, say, "doggie" as well, it would # be necessary to call retrieve with a regular expression that # matched both dog and doggie (e.g. retrieve("dog.*",filename)). # ############################################################################ # # Links: codeobj.icn, ./indexutl.icn, ./binsrch.icn, ./initfile.icn # ./findre.icn shquote.icn # # See also: makeind.icn, bmp2text.icn # ############################################################################ link codeobj, shquote # The following globals contain stats for current file (here, arg2). 
# global filestats		# declared in initfile.icn
# global IS			# declared in indexutl.icn

#
# retrieve(pattern, filename, inverse)
#
# Collects the locations (integer-coded bitmaps) in filename whose
# index keys match pattern.
#
#   pattern  - word or regular expression; it must match an index key
#              in its entirety.  It is mapped to lowercase when
#              IS.is_case_sensitive is null.
#   filename - name of the indexed file; abort() is called if null.
#   inverse  - if nonnull, the sense of the search is inverted, i.e.
#              every location *not* matching pattern is collected.
#
# Returns a sorted list of locations; fails if there were no hits.
#
procedure retrieve(pattern, filename, inverse)

    local bitmap_set, bmp_file, in_egrep, intext, cmd, offset, pattern2, line
    static is_UNIX, egrep_filename
    initial {
	if is_UNIX := find("UNIX",&features) then
	    # If egrep is available, use it.  It's fast.
	    egrep_filename := "egrep"
	    # egrep_filename := "/usr/local/bin/gnuegrep"
    }

    # Check for sloppy programming.
    /filename & abort("retrieve","you called me without a filename",22)

    # Initialize important variables.
    #
    if /filestats | /filestats[filename]
    then initfile(filename)		# see initfile.icn
    bitmap_set := set()			# set will contain locations of hits
    IS := filestats[filename].IS	# re-initialize IS for current file
    if /IS.is_case_sensitive then
	pattern := map(pattern)

    # Open bitmap file.
    #
    bmp_file := open(filestats[filename].bmp_filename) |
	abort("retrieve","can't open "||filestats[filename].bmp_filename, 29)

    # Search index.
    #
    if are_metas(pattern) then {
	# NB: are_metas() can be found in indexutl.icn

	# If there are metacharacters in pattern, do a regexp pattern match.
	# The .IND file goes:  line ::= key \t other-stuff.
	pattern := "^(" || pattern || ")\t"

	# If UNIX, then use egrep to search index.
	#
	if \is_UNIX then {

	    # Set up command line to be passed to /bin/sh.  First make
	    # sure we don't have any apostrophes hanging around to
	    # screw up the command line to be passed to /bin/sh, then
	    # put together a command line to be passed to egrep.
	    pattern2 := shquote(pattern)	# from the IPL
	    cmd := egrep_filename || " " || pattern2 || " " ||
		filestats[filename].ind_filename || " 2>&1"

	    # open pipe
	    in_egrep := open(cmd, "rp") |
		abort("retrieve","can't open pipe from\n\t"||cmd, 20)

	    # grep .IND index file
	    every line := !in_egrep do {
		# A genuine index line has the form  key \t offset.  Try
		# to parse it that way first, so that a key which merely
		# contains "error" or "grep" is not mistaken for an egrep
		# diagnostic, and so that retrieve_bitmaps() is never
		# called with a stale offset left over from an earlier line.
		if line ? (tab(find("\t")+1), offset := integer(tab(0))) then
		    bitmap_set ++:= retrieve_bitmaps(offset, bmp_file)
		# Kludge, but it's the only way to tell if there's an error.
		else if find("error"|"grep", line) then {
		    # Define some routine here that issues a warning; there
		    # is no need to actually abort!
		    (\err_message)("Regular expression syntax error.") |
			stop("retrieve:  regexp syntax error")
		    break
		}
	    }
	    close(in_egrep)

	# ...otherwise (i.e. if not UNIX) use findre() instead of egrep
	#
	} else {

	    # Probably MS-DOS or something else.  SLOW, SLOW!
	    intext := open(filestats[filename].ind_filename) |
		abort("retrieve","can't open index file", 21)

	    # grep .IND file
	    every line := !intext do {
		line ? {
		    if findre(pattern) then {
			# NOTE(review): this relies on findre() leaving the
			# scanning position just before the offset field;
			# confirm against ./findre.icn.  Only use the offset
			# if it actually parses as an integer.
			if offset := integer(tab(0)) then
			    bitmap_set ++:= retrieve_bitmaps(offset, bmp_file)
		    }
		}
	    }
	    # Close only the index file here.  bmp_file is still needed
	    # below for inverse searches and is closed at the end.
	    close(intext)
	}

    # If *not* are_metas(pattern), then do a binary search of the index.
    # No need to worry about is_UNIX, egrep, findre(), etc.
    #
    } else {
	if offset := binary_index_search(pattern,
					 filestats[filename].ind_filename)
	then bitmap_set ++:= retrieve_bitmaps(offset, bmp_file)
    }

    # If inverse (arg 3) is nonnull, then invert the sense of the search.
    # Do this by knocking out those parts of the full bitmap set that are
    # in the bitmap_set, and then assigning the result to bitmap_set.
    #
    if \inverse then
	bitmap_set := (all_bitmaps(bmp_file) -- bitmap_set)

    # We're done.  See if there were any hits.  If so, sort & return a
    # list (lists are easier for the display routines to handle).
    #
    close(bmp_file)
    if *bitmap_set > 0
    then return sort(bitmap_set)
    else fail

end


#
# retrieve_bitmaps(offset, f, return_a_list)
#
# Reads the block of bitmaps stored at offset in the open BMP file f.
#
# The block starts with a 24-bit header read via read_int().  Its top
# bit (inverse_signal) flags an "inverse" entry; the remaining 23 bits
# give the number of bitmaps that follow.  Each bitmap is read with a
# width of IS.len * IS.no rounded up to the next multiple of 8.
#
# Returns a set of bitmaps, or a sorted list if return_a_list is
# nonnull.
#
procedure retrieve_bitmaps(offset, f, return_a_list)

    local bitmap_list, bitmap_length, i, tmp, how_many_bitmaps,
	bits_needed, inverse_signal

    bits_needed := 24			# width of the header field
    inverse_signal := 8388608		# 2^23, i.e. top bit of the header

    seek(f, offset)
    # Smallest multiple of 8 that is >= IS.len * IS.no.
    bitmap_length := ((IS.len * IS.no) <= seq(0,8))
    tmp := read_int(f, bits_needed)
    how_many_bitmaps := iand(inverse_signal-1, tmp)

    # Slower way.
    # bitmap_list := list(how_many_bitmaps)
    # every i := 1 to how_many_bitmaps do
    #     bitmap_list[i] := read_int(f, bitmap_length)

    # Slow way.
    bitmap_list := list()
    every i := 1 to how_many_bitmaps do
	put(bitmap_list, read_int(f, bitmap_length))

    # If the inverse signal bit is turned on, then the BMP file stores
    # non-occurrences for a given key (rather than occurrences).  Saves
    # space for a/the/and, etc., but necessitates collecting all bitmaps
    # for the current file into a set and taking a set difference.  The
    # procedure all_bitmaps does the collecting.
    if iand(inverse_signal, tmp) ~= 0 then {
	# Note: from here on bitmap_list holds a set, not a list.
	bitmap_list := (all_bitmaps(f) -- set(bitmap_list))
    }

    if \return_a_list
    then return sort(bitmap_list)	# sort() yields a list either way
    else {
	if type(bitmap_list) == "list"
	then return set(bitmap_list)
	else return bitmap_list
    }

end


#
# all_bitmaps(f, return_a_list)
#
# At offset 1 in the BMP file is the list of all bitmaps in the
# full file.  Returns the set of these, unless a list is desired,
# in which case one must call all_bitmaps() with a nonnull second
# argument.
#
procedure all_bitmaps(f, return_a_list)

    return retrieve_bitmaps(1, f, return_a_list)

end