The idea here is that if we are looking at a file which is composed of lines sorted in order, and you have some idea what the lines you're interested in will start with, you should be able to efficiently pull out your target, even if the file is large.
For example, many log files start with a timestamp composed in a sortable order (i.e., year, month, day, hour, minute, second), where you may well have a fairly precise idea of what you want (e.g., from 12:02:05 on March 3, 2016 and the 10 seconds following). Normal grep will scan the entire file, which is a painful exercise if the file is many gigabytes. sgrep will in contrast seek to the interesting region and then search for a regular expression pattern only within that region.
So for the example given above, one would search as follows if looking at an artifactory request log:
sgrep 20160303120205 20160303120215 some_regex_pattern /private/artifactory/logs/request.log
Here's the code to do it, changing some minute searches to sub-second searches:
class Sgrep
attr_accessor :patt
attr_accessor :beginning_of_significance
attr_accessor :ending_of_significance
attr_accessor :fn
attr_accessor :f
def initialize(beginning_of_significance, ending_of_significance, patt, fn)
self.beginning_of_significance = beginning_of_significance
self.ending_of_significance = ending_of_significance
self.patt = Regexp.new(patt)
self.fn = fn
if !File.readable?(fn)
STDERR.puts "sgrep: #{fn}: No such file or directory"
exit(1)
end
self.f = File.open(fn, "r")
puts "sgrep looking for \"#{patt}\" in file #{fn}, bounded by the significant region starting with \"#{beginning_of_significance}\" and ending with \"#{ending_of_significance}\"..." if Sgrep.trace
# header = fh.readline
# Process the header
# while(line = fh.gets) != nil
# #do stuff
# end
# end"@@
end
def search_sequentially_from(pos)
if pos > 0
f.seek(pos-1, IO::SEEK_SET)
self.seek_next_line
else
f.seek(pos, IO::SEEK_SET)
end
while !self.f.eof? do
next_line_start = self.f.tell
line = self.f.gets
if line.start_with?(self.beginning_of_significance) || line > self.beginning_of_significance
f.seek(next_line_start, IO::SEEK_SET)
puts "searched sequentially to #{next_line_start} (seeing #{line})" if Sgrep.trace
return
end
end
end
def seek_beginning_of_significance()
lower_bound = 0
upper_bound = File.size(self.fn)
puts "lower_bound=#{lower_bound}, upper_bound=#{upper_bound}" if Sgrep.trace
while upper_bound > lower_bound do
midpoint = (lower_bound + ((upper_bound - lower_bound) / 2)).to_i
f.seek(midpoint, IO::SEEK_SET)
self.seek_next_line
next_line_start = self.f.tell
line = self.f.gets
puts "see #{line.chomp}, lower_bound=#{lower_bound}, upper_bound=#{upper_bound}, midpoint=#{midpoint}, next_line_start=#{next_line_start}" if Sgrep.trace
if line.start_with?(self.beginning_of_significance)
puts "match" if Sgrep.trace
if upper_bound > next_line_start
upper_bound = next_line_start
else
break
end
elsif line < self.beginning_of_significance
puts "under" if Sgrep.trace
lower_bound = self.f.tell+1
else
puts "over" if Sgrep.trace
upper_bound = midpoint-1
end
end
self.search_sequentially_from(lower_bound)
end
def seek_next_line()
while !self.f.eof? do
if self.f.getc == "\n"
return
end
end
end
def significant_lines
while !self.f.eof? do
line = f.gets
if line.start_with?(self.ending_of_significance) || line < self.ending_of_significance
yield line
else
return
end
end
return
end
def search()
#return `grep "#{self.patt}" #{self.fn}`
self.seek_beginning_of_significance
exit_code = 1
self.significant_lines do | line |
if self.patt.match(line)
exit_code = 0
print line
end
end
return exit_code
end
class << self
attr_accessor :trace
end
end
j = 0
while ARGV[j].start_with?("-") do
case ARGV[j]
when "-v"
Sgrep.trace = true
end
j += 1
end
beginning_of_significance = ARGV[j]
ending_of_significance = ARGV[j+1]
patt = ARGV[j+2]
fn = ARGV[j+3]
g = Sgrep.new(beginning_of_significance, ending_of_significance, patt, fn)
exit(g.search())