Friday, March 18, 2016

simple grep replacement for sorted data

The idea here is that if we are looking at a file which is composed of lines sorted in order, and you have some idea what the lines you're interested in will start with, you should be able to efficiently pull out your target, even if the file is large.

For example, many log files start with a timestamp composed in a sortable order (i.e., year, month, day, hour, minute, second), where you may well have a fairly precise idea of what you want (e.g., from 12:02:05 on March 3, 2016 and the 10 seconds following). Normal grep will scan the entire file, which is a painful exercise if the file is many gigabytes.  sgrep will in contrast seek to the interesting region and then search for a regular expression pattern only within that region.

So for the example given above, one would search as follows if looking at an artifactory request log:

        sgrep 20160303120205 20160303120215 some_regex_pattern /private/artifactory/logs/request.log
Here's the code to do it, changing some minute searches to sub-second searches:
class Sgrep
        attr_accessor :patt
        attr_accessor :beginning_of_significance
        attr_accessor :ending_of_significance
        attr_accessor :fn
        attr_accessor :f
        def initialize(beginning_of_significance, ending_of_significance, patt, fn)
                self.beginning_of_significance = beginning_of_significance
                self.ending_of_significance = ending_of_significance
                self.patt = Regexp.new(patt)
                self.fn = fn
                if !File.readable?(fn)
                        STDERR.puts "sgrep: #{fn}: No such file or directory"
                        exit(1)
                end
                self.f = File.open(fn, "r")
                puts "sgrep looking for \"#{patt}\" in file #{fn}, bounded by the significant region starting with \"#{beginning_of_significance}\" and ending with \"#{ending_of_significance}\"..." if Sgrep.trace
                #        header = fh.readline
                # Process the header
                #     while(line = fh.gets) != nil
                #         #do stuff
                #     end
                # end"@@
        end
        def search_sequentially_from(pos)
                if pos > 0
                        f.seek(pos-1, IO::SEEK_SET)
                        self.seek_next_line
                else
                        f.seek(pos, IO::SEEK_SET)
                end
                while !self.f.eof? do
                        next_line_start = self.f.tell
                        line = self.f.gets
                        if line.start_with?(self.beginning_of_significance) || line > self.beginning_of_significance
                                f.seek(next_line_start, IO::SEEK_SET)
                                puts "searched sequentially to #{next_line_start} (seeing #{line})" if Sgrep.trace
                                return
                        end
                end
        end
        def seek_beginning_of_significance()
                lower_bound = 0
                upper_bound = File.size(self.fn)
                puts "lower_bound=#{lower_bound}, upper_bound=#{upper_bound}" if Sgrep.trace
                while upper_bound > lower_bound do
                        midpoint = (lower_bound + ((upper_bound - lower_bound) / 2)).to_i
                        f.seek(midpoint, IO::SEEK_SET)
                        self.seek_next_line
                        next_line_start = self.f.tell
                        line = self.f.gets
                        puts "see #{line.chomp}, lower_bound=#{lower_bound}, upper_bound=#{upper_bound}, midpoint=#{midpoint}, next_line_start=#{next_line_start}" if Sgrep.trace
                        if line.start_with?(self.beginning_of_significance)
                                puts "match" if Sgrep.trace
                                if upper_bound > next_line_start
                                        upper_bound = next_line_start
                                else
                                        break
                                end
                        elsif line < self.beginning_of_significance
                                puts "under" if Sgrep.trace
                                lower_bound = self.f.tell+1
                        else
                                puts "over" if Sgrep.trace
                                upper_bound = midpoint-1
                        end
                end
                self.search_sequentially_from(lower_bound)
        end
        def seek_next_line()
                while !self.f.eof? do
                        if self.f.getc == "\n"
                                return
                        end
                end
        end
        def significant_lines
                while !self.f.eof? do
                        line = f.gets
                        if line.start_with?(self.ending_of_significance) || line < self.ending_of_significance
                                yield line
                        else
                                return
                        end
                end
                return
        end
        def search()
                #return `grep "#{self.patt}" #{self.fn}`
                self.seek_beginning_of_significance

                exit_code = 1
                self.significant_lines do | line |
                        if self.patt.match(line)
                                exit_code = 0
                                print line
                        end
                end
                return exit_code
        end
        class << self
                attr_accessor :trace
        end
end

j = 0
while ARGV[j].start_with?("-") do
        case ARGV[j]
        when "-v"
                Sgrep.trace = true
        end
        j += 1
end
beginning_of_significance = ARGV[j]
ending_of_significance = ARGV[j+1]
patt = ARGV[j+2]
fn = ARGV[j+3]
g = Sgrep.new(beginning_of_significance, ending_of_significance, patt, fn)
exit(g.search())