#!/usr/bin/perl -w
# hgrep:
# Grep headers of newsspool/maildir/mbox style files: two very special
# features here:
# 1. Everything after first blank line is ignored. (In mbox files pattern
# matching resumes after the next line matching /^From /.)
# 2. Continued header lines are handled properly, either as multiple lines
# in the output (default) or joined.
#
# Possible future enhancement(s):
# grep MIME segment headers
# grep headers of attachments/bodies of type "message/rfc822"
# decode 8bit content in message headers (RFC1342)
# option to specify which headers to match on or to ignore
#
# Copyright 1998 Eli the Bearded.
#
use strict;
use integer;
use vars qw { $VERSION %options };
$VERSION = '1.0';
# Set some defaults for the options.
%options = (
## show the file name in the output, if set
showfile => 1,
## show the line number, if set
showline => 0,
## show matched text, if set
matches => 1,
## unlink matching files, if set
unlink => 0,
## put separators between matches, if set
separator => 0,
## join lines of continued headers, if set
join => 0,
## count files/matches, if set
count => 0,
## only show final count, if set
silentcount => 0,
## treat the arguments as directories and process all files in them, if set
## (useful for specifing newsspool directories that might otherwise be too
## long for a single command line)
readdir => 0,
## treat as mbox format, if set
## this meanst that rather than one header per file, there can be many, each
## begins after a line matching /^From /
mbox => 0,
## show a letter count, if set
## useful for knowing which letters in a mbox contributed each match
lcount => 0,
## show usage and exit, if set
help => 0,
);
# This is the most hackish part of this. It is a standard little bit of
# code that will toggle 0/1 boolean values of variables based on their
# presence in the command line. This makes for very simple options parsing
# that *need not change at all when I add new options*.
while ($ARGV[0] =~ /^--?(\w+)/) {
my $tmp = $1;
if (defined ($options{$tmp})) {
# Toggle value
$options{$tmp} ^= 1;
} else {
warn "$0: Bad option: $1; use --help for help\n";
}
shift;
}
if ($options{help}) {
print < 1 message per file}
(unset) -lcount show a letter count {for mbox files}
(unset) -help show usage and exit
All options are toggles. Default values shown in (parentheses). If one
was previously set, it is now unset, if previously unset it is not set.
Options can be included multiple times. -unlink is a dangerous option.
See perlre(1) for regexp help.
HGrepUsage
exit(0);
}
# Now grab the RE from the command line.
my $pat=shift;
my $file;
my $countm = 0;
my $countf = 0;
my $sep = ''; # sep is the file separator
# Process the files.
foreach $file (@ARGV) {
my $filename;
my $rc;
if ($options{readdir}) {
# "$file" is really a directory
opendir (D, $file) or die "Could not open directory $file:\n$!\n";
while (defined($filename=readdir(D))) {
print $sep; # print before setting, in case this is the first pass
# If we are just dealing with the current directory, don't
# prepend the directory to the filename.
if ($file eq '.') {
$rc = &checkfile(\$filename);
} else {
$rc = &checkfile(\"$file/$filename");
}
if ($options{separator} and $rc) {
$sep = "------\n";
} else {
$sep = '';
}
}
closedir D;
} else {
# we are just dealing with files
print $sep;
$rc = &checkfile(\$file);
if ($options{separator} and $rc) {
$sep = "------\n";
} else {
$sep = '';
}
}
if ($countf > 1 and $options{count}) {
print "$countf files with $countm matches\n";
}
}
# Check a file.
sub checkfile {
my $file = shift;
my $last;
my $matchc;
my $countl;
my $separ = ''; # separ is the intrafile match separator
if (open(IN,"){
# Since we have not choped/chomped $_, if we have anything begining with
# whitespace and at least two bytes, we are not at the end of the headers.
if (/^\s+./) {
# Append
if ($options{join}) {
chomp $last;
$last.=" $_"
} else {
$last.=$_
}
} else {
my $field;
if (defined($last)) {
$field=0;
if ($last =~ /$pat/os) {
# Found a match, do something
print $separ;
chomp $last;
$matchc++;
$countm++;
if ($options{showfile}) {
print ":" if $field;
print "$$file";
$field++;
}
if ($options{lcount}) {
print ":" if $field;
print "$countl";
$field++;
}
# Showline should print the line number of the start of the
# header matched.
if ($options{showline}) {
print ":" if $field;
print (($. - ($last =~ tr:\n::) - 1));
$field++;
}
if ($options{matches}) {
print ":" if $field;
print "$last";
$field++;
}
print "\n" if $field;
$separ = "---\n" if $options{separator};
goto ENDFILE if (!$options{matches} or $options{unlink})
and !$options{count};
} # found a match
}
if (/^\s$/) {
last unless $options{mbox};
$countl++;
# Skip to next message
while() {
last if /^From /
}
}
# Set
$last=$_
}
} # while
ENDFILE:
# $. resets on close
close IN;
unlink $$file if ($options{unlink} and $matchc);
print "$$file-$matchc\n" if $options{count} and !$options{silentcount};
$countf++ if $matchc;
return $matchc;
} # if open IN
else {
warn "Can't open $$file: $!\n";
return 0;
}
} # end &checkfile
__END__
=head1 NAME
hgrep - grep through RFC822-style headers, skipping body part of message.
=head1 README
hgrep - grep through RFC822-style headers, skipping body part of message.
=head1 DESCRIPTION
hgrep greps headers of newsspool/maildir/mbox style files with two very
special features.
1. Everything after first blank line is ignored. (In mbox files pattern
matching resumes after the next line matching /^From /.)
2. Continued header lines are handled properly, either as multiple lines
in the output (default) or joined.
=head1 USAGE
hgrep [options] perlre [file ... | directory ...]
Options:
=over 4
=item *
( set) -showfile
Show the file name in the output.
=item *
(unset) -showline
Show the line number of start of matching header.
=item *
( set) -matches
Show matched text.
=item *
(unset) -unlink
Unlink (delete) matching files. A dangerous option to use.
=item *
(unset) -separator
Put separators between matches. A line with six hyphens (------) will
appear between matching files; a line with three hyphens (---) will
appear between matches within files.
=item *
(unset) -join
Join lines of continued headers in output.
=item *
(unset) -count
Count matching files and matches per file.
=item *
(unset) -silentcount
With -count, only show final count rather than a count for each
file.
=item *
(unset) -readdir
Treat the arguments as directories and process all files in them.
Useful for specifing newsspool directories that might otherwise be too
long for a single command line.
=item *
(unset) -mbox
Treat as mbox format (look for more than one message per file).
=item *
(unset) -lcount
Show a letter count (for mbox files). 'Letter count' is the message
number in the file for the match.
=item *
(unset) -help
Show usage and exit.
=back
All options are toggles. Default values shown in (parentheses). If one
was previously set, it is now unset, if previously unset it is not set.
Options can be included multiple times. -unlink is a dangerous option.
=head1 PREREQUISITES
The regular expressions available are limited to your installed version
of perl. The C, C, and C pragma modules are used.
=head1 COREQUISITES
No optional CPAN modules needed.
=head1 OSNAMES
A unix-like directory structure is assumed.
=head1 SEE ALSO
L(1) for regular expression help
=head1 COPYRIGHT
Copyright 1998 by Eli the Bearded / Benjamin Elijah Griffin.
Released under the same license(s) as Perl.
=head1 AUTHOR
Eli the Bearded originally wrote this to tool to help manage incoming
files for a moderated newsgroup. The -unlink option was added for nuking
spam.
=pod SCRIPT CATEGORIES
Mail
News
=cut