#!/usr/bin/perl -w
=head1 NAME
dcheck - a date-in-file checker and adjuster
=head1 SCRIPT CATEGORIES
Web
=head1 SYNOPSIS
dcheck [-riyuefcbht] [file/path...] [-m pattern]
[-d datestyle] [-x ext] [-a path] [-v level]
[--no-recurs] [--match=pattern] [--no-bin]
[--date-style=datestyle] [--short-year]
[--mdate-err] [--nodate-err] [--force-date] [--correct]
[--backup] [--bak-ext=ext] [--bak-path=path]
[--help] [--settings] [--verbose=level]
=head1 README
This script checks given files for dates matching a given format and compare
this with the date of last modification of the file.
Actually the script can do the following:
=over 4
=item *
Find and correct false dates.
=item *
Find files without any date.
=item *
Force dates if no date found.
=item *
Find files with more than one date.
=back
The main motivation for writing this script was to adjust the dates of last
modification in today over 1.100 pages on a constantly growing website. Later
I used it for other web-projects, for memos, source codes...
You specify all options by using the command line. The following is the hole
story about this options:
=head2 Directories and Files
All non-options found in commandline assumed to be either a file or a path.
Files and paths can be mixed together (also with options) but the paths should
be valid and will direct send to the find-routine from C for
(recursivly) scanning. B You can use your shell-globbing commandline to
submit many files but they also have to pass the C match if given. If no
file and no path given the current workdirectory is used.
=over 4
=item -r, --no-recurs
No recursive file scanning.
=item -m, --match
Regular expression for files to match. Use Doublequotes to submit complex RXs.
B The complete filename including the path is tested. Multiple entries
are possible but the order is important for speed. All files will checked if
not set. (the default behavior)
=item -i, --no-bin
If you don't want to check binary files you have to set this flag. The test is
made by Perls Filetestoperator.
=back
=head2 Date to match
=over 4
=item -d, --date-style
The style of the date to match.
The syntax for submitting datestyles is similar to the syntax of the C
command known in the unix-world. You specify a line containing some directives
which matching and producing parts of a date. All directives starts with '%'
followed by one char. All other chars will used unchanged.
Possible directives:
%d %D Day
%m %M Month
%y %Y Year
%t %T Minute
%s %S Second
%h %H Hour
%% simple matchs and produces '%'
The Year-directive looks for numeric fields with exactly two (%y) or exactly
four (%Y) digits. All other directives are looking for one or two digits.
Use the uppercase chars to produce numeric fields with leading zeroes.
This datestyle is used to create an format-string for the printf-routine and
an regular expression to match the date. Also the order of the items extracted
from there. You can view this variables by using the C flag.
Examples for datestyles:
# simple
%D-%M-%Y => 20-07-1973
%M/%D/%Y => 07/20/73
# with time
%D-%M/%Y %H:%T => 20-07/1973 07:10
%D-%M-%Y %H:%T:%S => 20-07-1973 07:10:24
# short
%d/%m %h:%m => 20/7 7:10
# with static stuff
modified %D-%M-%Y => modified 20-07-1973
=item -y, --short-year
Produces 2-digit years. This is bad! If not set and datestyles like %d-%m-%y
used all dates causes errors because the script produces 4-digit years by
default. Use only if there is a good reason for keeping short years. The
script can be used to convert short to long years this way:
# files containing dd/mm/yy
dcheck -d "%d/%m/%y" -c
# many errors follows, all errors will corrected with long year
# now you have to change your datestyle to match long years
dcheck -d "%d/%m/%Y"
# no more errors
=back
=head2 Error-correcting and backup
All found errors will displayed on STDERR and possible corrected.
=over 4
=item -u, --mdate-err
Files with more than one date causes errors. This errors could'nt be corrected
but will reported with linenumbers for easy finding them.
=item -e, --nodate-err
Files without date causes errors.
=item -f, --force-date
Any files must have a date otherwise the file will expanded by a line
containing the date in the actual format. Sets C. No changes will made
unless the C flag is given. B Be very very carefull with that flag!
Using it without neither the C nor the C-flag can destroy all your
binary stuff!
=item -c, --correct
Determines if found errors will be corrected.
=item -b, --backup
Keeping backups of changed files. I
=item -x, --bak-ext
Extension for backup-files. Not in use if a backup-path is specified.
=item -a, --bak-path
Path to hold the backups. If this path is given the directory-structure of the
original data is copied there if backups necessary. The file-names keeping
untouched. You can simply delete this backuptree later or use it to restored
your original data. Overrides C.
=back
=head2 User-interface
=over 4
=item -h, --help
Shows a usage-summary and exits.
=item -s, --settings
Shows the actual settings (and dependence) and exits.
=item -v, --verbose
Show statusmessages during run. 0=silent...3=noisy.
=back
=head1 EXAMPLES
# This checks every found file in ~/public_html for a date
# matching the given style and report false dates. No changes.
dcheck ~/public_html -d "last changed: %M/%D/%Y"
# The same but files without any date causes errors.
dcheck ~/public_html -e -d "last changed: %M/%D/%Y"
# This checks all files found in the current workdir and matching
# the -m regex (should be .htm and .html). Dates will corrected
# if necessary and backups of the changed files will stored
# in ~/bak with the original names.
dcheck -a ~/bak -b -c -m "\.html?$" -d "last changed: %D/%M/%Y"
# The same but the backup files will stored in the same directory
# like the original file with the extension "bak".
dcheck -b -c -m "\.html?$" -d "last changed: %D/%M/%Y"
# The same but files without any date will expanded by one
# line containing the date of last modification in the given format.
# No backups will made (bad).
dcheck -c -f -m "\.html?$" -d "last changed: %D/%M/%Y"
# The same but the date contains time-information
dcheck -c -f -m "\.html?$" -d "last changed: %D/%M/%Y %H:%T:%S"
=head1 COPYRIGHT
Copyright (c) 1999 S. All rights
reserved. This program is free software. You may modify and/or distribute it
under the same terms as Perl itself. This copyright notice must remain
attached to the file.
=head1 TODO
=over 4
=item *
Matching and producing Names of Days/Months. Any suggestions?
=back
=cut
## ----------------------------------------------------------------------------
## now we begin with working code
my $VERSION = 0.9;
use strict;
$|++;
use File::Find;
use File::Basename;
use File::Path;
use File::Copy;
use Getopt::Long;
use Cwd;
## ----------------------------------------------------------------------------
## definitions section:
## (the next values are defaults and will overwritten by using the commandline)
my $VERBOSE = 1; # 0..3
my $SHOWHELP = 0; # set/unset
my $SETTINGS = 0; # set/unset
my $CORRECT = 0; # correct errors?
my $BACKUP = 0; # keep backup-files?
my $BACKUPEXT = "bak"; # the extension for backup-files
my $BACKUPPATH = ""; # copy the backups to this path
my $DATESTYLE = "modified %D-%M-%Y"; # the date definition
my $SHORTYEAR = 0; # produce short years (bad)
my $NODATEERR = 0; # files without date causes errors?
my $MDATEERR = 0; # files with more than one date causes errors?
my $FORCEDATE = 0; # force a date in file?
my $NORECURS = 0; # recurs into subdirs?
my $NOBIN = 0; # don't perform binaries?
my @NODES = (); # files/paths to scan
my @REGEXS = (); # regex to match for file (no if empty)
## end of definition-section
## ----------------------------------------------------------------------------
## now follows some usage-information for the user
(my $progname = $0) =~ s!^.*/!!; # kill any path-information from my progname
my $short_usage = "Usage: $progname [--help] [options]... [file/path]... \n";
my $long_usage = <
END_OF_LONG_USAGE
## ----------------------------------------------------------------------------
## ok, now we get the information from the commandline
GetOptions(
"r|no-recurs" => \$NORECURS,
"m|match=s" => \@REGEXS,
"i|no-bin" => \$NOBIN,
"d|date-style=s" => \$DATESTYLE,
"y|short-year" => \$SHORTYEAR,
"u|mdate-err" => \$MDATEERR,
"e|nodate-err" => \$NODATEERR,
"f|force-date" => \$FORCEDATE,
"c|correct" => \$CORRECT,
"b|backup" => \$BACKUP,
"x|bak-ext=s" => \$BACKUPEXT,
"a|bak-path=s" => \$BACKUPPATH,
"h|help" => \$SHOWHELP,
"s|settings" => \$SETTINGS,
"v|verbose=i" => \$VERBOSE,
"<>" => sub { push @NODES, @_ }, # nonopts are files/paths
) or die $short_usage; # options-error
## ----------------------------------------------------------------------------
## some options need work now
$SHOWHELP and die $long_usage; # help wanted
@NODES or push @NODES, cwd(); # no path given -> use workdir
$BACKUPPATH and $BACKUPEXT = "overwritten by -a"; # -a overrides -x
$FORCEDATE and $NODATEERR = 1; # -f sets -e
my $YEARADD = $SHORTYEAR ? 0 : 1900; # short years are very bad
## ----------------------------------------------------------------------------
## build the formatstring, regex-string and extract order from given datestyle
##
## We scan the datestyle for %-directives. For that we try to match an '%'
## followed by either a char or the end of the line. If found and known we
## replace it with the corresponding regex/formatvar. If unknown we die. Note!
## If matched the end of the line we found a lonely '%' and that fails.
## Non-directives will stored unchanged (but quoted for the regex).
my $DATEREG = ""; # the regex to match a date
my $FORMSTRING = ""; # the format-string to printf a date
my @ORDER = (); # the order of directives
$_ = $DATESTYLE; # easy reading and prevent $DATESTYLE
while (/%(.|$)/) { # match the %-directives
$DATEREG .= quotemeta $`; # prematch
$FORMSTRING .= $`;
push @ORDER, $1; # store order
SWITCH: for ($1) {
/[dmtsh]/ and do {
$DATEREG .= "\\d{1,2}";
$FORMSTRING .= "%d";
last SWITCH;
};
/[DMTSH]/ and do {
$DATEREG .= "\\d{1,2}";
$FORMSTRING .= "%02d";
last SWITCH;
};
/y/ and do {
$DATEREG .= "\\d{2}";
$FORMSTRING .= "%d";
last SWITCH;
};
/Y/ and do {
$DATEREG .= "\\d{4}";
$FORMSTRING .= "%d";
last SWITCH;
};
/%/ and do {
$DATEREG .= "\\%";
$FORMSTRING .= "%%";
last SWITCH;
};
# if we reach this line we've found a unknown directive
die "unknown directive in datestyle \n";
};
$_ = $'; # postmatch for next test
}
$DATEREG .= quotemeta $_; # the rest of the line
$FORMSTRING .= $_;
## ----------------------------------------------------------------------------
## collect actual settings in a pretty formatted form
my $actual_settings = < $VERBOSE
-c --correct => $CORRECT
-b --backup => $BACKUP
-x --bak-ext => "$BACKUPEXT"
-a --bak-path => "$BACKUPPATH"
-d --date-style => "$DATESTYLE"
(formstring) => "$FORMSTRING"
(datereg) => "$DATEREG"
(order) => @ORDER
-y --short-years => $SHORTYEAR
(year add) => $YEARADD
-u --mdate-err => $MDATEERR
-e --nodate-err => $NODATEERR
-f --force-dates => $FORCEDATE
-r --no-recurs => $NORECURS
-i --no-bin => $NOBIN
files/paths => @NODES
-m --match => @REGEXS
END_OF_SETTINGS
$SETTINGS and die $actual_settings;
## ----------------------------------------------------------------------------
## ok, now are all things properly set or we dead
$VERBOSE and print " $progname started (".localtime().")\n";
$VERBOSE > 2 and print $actual_settings;
## ----------------------------------------------------------------------------
## scan the given paths for files and store names and informations:
##
## we need the date of last modification and the permissions (to restore it)
## the hash we build looks like:
## %all_files = { "a_file" => { "date" => "the_date", "perm" => "the_permission }, ... }
my %all_files = ();
## ----------------------------------------------------------------------------
## _add_file - adds files to the hash or returns silent
sub _add_file {
my $file = shift;
# First test for matching the given regex. If this fails we don't do any
# slowly filetest. :-)
if (@REGEXS) { # match regexs if given
my $file_ok = 0;
for (@REGEXS) {
last if $file_ok += ($file =~ /$_/)
}
$file_ok or return;
}
-f $file or return; # real files only
$NOBIN and -B $file and return; # handle binaries
my @stat = (stat($file)) or return;
# now the real inserts to the hash
$all_files{$file}{mtime} = $stat[9];
$all_files{$file}{perm} = $stat[2];
$VERBOSE > 1 and print " file added \n";
}
## ----------------------------------------------------------------------------
## this is the scan loop:
##
## if node is already a file we add it directly to the hash
## if node is a directory we scan this for files using the File::find routine
## in the find-routine we possible have to set the File::Find::prune
## to preserve against recursivly scanning
for my $node (@NODES) {
if (-d $node) {
$VERBOSE and print " scanning for files in \n";
find( sub {
$NORECURS and ( -d $File::Find::name)
and ($node ne $File::Find::name)
and ($File::Find::prune = 1);
_add_file($File::Find::name);
},
$node )
}
else {
_add_file($node);
}
}
$VERBOSE and print " found ", scalar keys %all_files, " file(s)\n";
## ----------------------------------------------------------------------------
## perform the tests
##
## for this we scan all files in hash for a date matching the given datestyle.
## if there is a date and this is not corresponding with the filedate this
## will be corrected if wished by user. During this we count the found dates
## per file and printing errors for files without or with multiple dates. Last
## it's possible to expand the file with the date of last modification if no
## date found. Very last we play a little bit with backup-logics.
my %STAT = (); # to store some statistics
for my $file (keys %all_files) {
$VERBOSE > 1 and print " checking file \n";
# setup some stuff for this file
my @lines_err = (); # lines with errors
my @lines_ok = (); # lines with correct dates
my $date_forced = 0; # not done yet
# the next is a little bit harder. we have to build a date in given
# style using the date of last modification of the file
#
# for this we get the information from mtime first
my ($sec, $min, $hour, $mday, $month, $year) = (localtime($all_files{$file}{mtime}))[0..5];
# OK, now we fill the @vals-array with the items in the order found
# in @ORDER. Then we do the formatted output using sprintf with our
# earlier builded formatstring.
my @vals = ();
for (@ORDER) {
/d/i and do { push @vals, $mday ; next };
/m/i and do { push @vals, $month + 1 ; next };
/y/i and do { push @vals, $year + $YEARADD ; next };
/t/i and do { push @vals, $min ; next };
/s/i and do { push @vals, $sec ; next };
/h/i and do { push @vals, $hour ; next };
}
my $last_modified = sprintf($FORMSTRING, @vals);
# open the original for reading and the new one if error-correcting is on
my $old = $file;
my $new = "$file.tmp.$$" if $CORRECT;
open(OLD, "< $old") or die "can't open $old: $!";
open(NEW, "> $new") or die "can't open $new: $!" if $CORRECT;
# now we scan the original line by line
while () {
# the logic behind the next loop was earlier described
# by Randal L. Schwartz in his WebTechniques-Columne which can be found
# at http://www.stonehenge.com/merlyn/col12.html.
#
# We try to match a date in given style. If this works we found a
# date. We store the line before the date (the pre-match) and set $_
# to everything after the match for the next test. The loop ends if
# no more hits found and so we walk the line looking for matches.
#
# The date itselfs is the match and is stored by Perl in $&.
#
# The so found date is compared with the filedate. If this is not equal
# we store the date of last modification otherwise the original date.
#
# After the loop ends we write the stored data (should be the original
# line with possible changed dates) and everything leaved in $_ to
# the new file.
my $line = "";
while (/$DATEREG/o) { # match the date
$line .= $`; # prematch
if ($& ne $last_modified) { # found date is incorrect
warn "error: false date in file at line .",
" found date: should be: \n";
$line .= $last_modified; # the match, but changed
push @lines_err, $.; # store linenumber
$STAT{"date(s) incorrect"}++;
}
else { # found date is correct
$line .= $&; # the orginal match
push @lines_ok, $.; # store linenumber
$STAT{"date(s) correct"}++;
}
$_ = $'; # postmatch for next test
}
(print NEW $line.$_) or die "can't write to $new: $!" if $CORRECT;
}
# scanning lines of original is complete now
# the new file should contain a copy of the original with adjusted dates
# handle no-date, multiple-date and force-date
# @lines_er and @lines_ok contains all numbers of lines with date
my $dates_found = @lines_err + @lines_ok;
$MDATEERR and $dates_found > 1
and warn "error: more than one date in file at lines \n"
and $STAT{"file(s) with multiple dates"}++;
$NODATEERR and ! $dates_found
and warn "error: no date in file \n"
and $STAT{"file(s) without any date"}++;
$FORCEDATE and $CORRECT
and ! $dates_found
and (print NEW "\n$last_modified\n")
and $date_forced++
and $STAT{"date(s) forced"}++;
# ok, we can close the original and the new one now
close(OLD) or die "can't close $old: $!";
close(NEW) or die "can't close $new: $!" if $CORRECT;
# if error-correcting is on we have now two files the original and
# a copy with possible adjusted dates.
#
# If no changes made we just unlink the copy.
#
# Otherwise we basicly have to rename the copy to original and restore
# the permission and the date of last modification of the original.
# But before doing that we backup the original if wanted. Therefore we
# copy the orginal to the backup and restore the permission and mtime.
# The location of the backup is either beside the file with a given
# extension (but at least one dot) or in the backuppath. In the second case
# the backuptree will be a copy of the original directory-structure and the
# file is named same like the original.
if ($CORRECT) {
if (@lines_err or $date_forced) {
if ($BACKUP) {
my $bak = "$file.$BACKUPEXT";
if ($BACKUPPATH) {
($bak = $BACKUPPATH.$file) =~ s(//)(/);
my $dir = dirname($bak);
-d $dir or mkpath([$dir]) or die "can't mkpath $dir: $!";
}
copy($old, $bak) or die "can't copy $old to $bak: $!";
chmod $all_files{$file}{perm}, $bak or die "can't chmod $bak: $!";
utime time, $all_files{$file}{mtime}, $bak or die "can't utime $bak: $!";
}
rename($new, $old) or die "can't rename $new to $old: $!";
chmod $all_files{$file}{perm}, $old or die "can't chmod $old: $!";
utime time, $all_files{$file}{mtime}, $old or die "can't utime $old: $!";
warn " note: file changed \n";
$STAT{"file(s) changed"}++;
}
else {
unlink $new or die "can't unlink $new: $!";
}
}
$STAT{"file(s) checked"}++;
$VERBOSE > 1 and @lines_ok
and print " found correct date(s) in file ",
"at line(s) \n";
}
## ----------------------------------------------------------------------------
## this is the end
$VERBOSE and do {
print " $progname ended (".localtime().")\n";
for (sort keys %STAT) { printf(" %8d $_\n", $STAT{$_}); }
}
## ----------------------------------------------------------------------------
## little nice self-test: last modified 16-07-1999 14:44:28
##
## to check use --datestyle="modified %D-%M-%Y %H:%T:%S"