#!/usr/bin/perl -w
=head1 Genome Data Generator
This script creates a set of HTML include files that list the statistics for
the genomes in each of the genome groups. Genomes that are new to this version
of the Sprout will be specially marked. In order for this to work, both the
current and previous Sprout databases must be available on this machine.
This is one positional parameter: the name of a directory in which to place
the include files.
The currently-supported command-line options are as follows.
=over 4
=item user
Name suffix to be used for log files. If omitted, the PID is used.
=item trace
Numeric trace level. A higher trace level causes more messages to appear. The
default trace level is 2. Tracing will be directly to the standard output
as well as to a CIC file in the FIG temporary directory,
where I is the value of the B option above.
=item sql
If specified, turns on tracing of SQL activity.
=item background
Save the standard and error output to files. The files will be created
in the FIG temporary directory and will be named CIC and
CIC, respectively, where I is the value of the
B option above.
=item h
Display this command's parameters and options.
=item strict
If specified, strict groups will be used; otherwise, groups with a common primary name
will be combined into a single group. (The primary name of the group is the first
capitalized word.)
=item oddStyle
Style to use for odd rows of the table.
=item evenStyle
Style to use for even rows of the table.
=item tableStyle
Style to use for the table itself.
=item markerStyle
Style to use for small-text markers (e.g. NEW!)
=item linkCGI
Path to the CGI script for displaying detailed statistics.
=back
=cut
use strict;
use Tracer;
use DocUtils;
use TestUtils;
use Cwd;
use File::Copy;
use File::Path;
use Sprout;
use SFXlate;
use CGI qw(:standard);
use FIG;
no warnings 'once'; # only when coding
# Get the command-line options and parameters.
my ($options, @parameters) = StandardSetup([qw(Sprout ERDB) ],
{
strict => [0, 'keep related groups separate'],
oddStyle => ['odd', 'style for odd rows'],
evenStyle => ['even', 'style for even rows'],
tableStyle => ['genomestats', 'style for whole table'],
markerStyle => ['tinytext', 'style for markers'],
linkCGI => ['../FIG/genome_statistics.cgi',
'path to CGI script for detailed statistics'],
},
"",
@ARGV);
# Verify the directory name.
my $targetDir = $parameters[0];
if (! $targetDir) {
Confess("No target directory specified.");
} elsif (! -d $targetDir) {
Confess("Target directory $targetDir not found.");
} else {
# *Get the old Sprout.
my $oldSprout = SFXlate->new_sprout_only($FIG_Config::oldSproutDB);
# Extract the genome group data from the old Sprout.
my %oldGroupHash = $oldSprout->GetGroups();
if (! $options->{strict}) {
%oldGroupHash = Fix(%oldGroupHash);
}
# Get the new Sprout.
my $sprout = SFXlate->new_sprout_only();
my %newGroupHash = $sprout->GetGroups();
if (! $options->{strict}) {
%newGroupHash = Fix(%newGroupHash);
}
# Loop through the groups.
for my $groupID (keys %newGroupHash) {
Trace("Processing group $groupID.") if T(2);
# Get the genomes from the new hash.
my @newGenomes = @{$newGroupHash{$groupID}};
# Create a hash for finding if a genome is in the old group. If the entire group is
# new, we just use an empty hash.
my %oldGenomes = ();
if (exists $oldGroupHash{$groupID}) {
%oldGenomes = map { $_ => 1 } @{$oldGroupHash{$groupID}};
}
# Create the output file.
Open(\*GROUPFILE, ">$targetDir/$groupID.inc");
# Get the styles.
my ($tableStyle, $markerStyle, @rowStyle) = ($options->{tableStyle}, $options->{markerStyle},
$options->{evenStyle}, $options->{oddStyle});
# Start the table.
print GROUPFILE "

\n";
# Create the header row.
print GROUPFILE Tr( { class => 'odd' }, th(["Strain annotated in NMPDR",
"Genome size, bp",
"Protein Encoding Genes (PEGs)",
"Named genes in subsystems", # s0
"Named genes not in subsystems", # n0
"Hypothetical genes in subsystems", # s1
"Hypothetical genes not in subsystems", # n1
"RNAs",
])) . "\n";
# Set up some useful stuff for the four count columns.
my %linkParms = ( s0 => "nohypo_sub", n0 => "nohypo_nosub",
s1 => "hypo_sub", n1 => "hypo_nosub" );
my @columnTypes = ('s0', 'n0', 's1', 'n1');
# The data rows will be built next. We'll be putting them into a hash keyed by
# organism name. The hash enables us to spit them out sorted by name.
my %rows = ();
# Loop through the genomes in the new group.
for my $genomeID (@newGenomes) {
# Check to see if this genome is new.
my $new = (! exists $oldGenomes{$genomeID} ? "new " : "");
Trace("Processing ${new}genome $genomeID for $groupID.") if T(3);
# Get the strain name.
my $genomeName = $sprout->GenusSpecies($genomeID);
# If this is a new strain, build the HTML for the NEW! mark.
if ($new) {
$new = " NEW!";
}
# Get the genome length.
my $genomeLen = $sprout->GenomeLength($genomeID);
# Get the number of PEGs.
my $pegCount = $sprout->FeatureCount($genomeID, 'peg');
# Get the number of RNAs.
my $rnaCount = $sprout->FeatureCount($genomeID, 'rna');
# Now we have four categories of features to work with, for each
# combination of named or hypothetical vs. in-subsystem or
# not-in-subsystem. First, we get all of the feature assignments for
# the genome.
my $assignHash = $sprout->GenomeAssignments($genomeID);
# Next, we get all of the features in the genome that belong to a
# subsystem. This involves a query via the subsystem spreadsheet.
my %ssHash = map { $_ => 1 } $sprout->GetFlat(['IsGenomeOf', 'ContainsFeature'],
"IsGenomeOf(from-link) = ?",
[$genomeID], 'ContainsFeature(to-link)');
# Create a hash to track the four categories. "s" or "n" indicates
# in or out of a subsystem. "1" or "0" indicates hypothetical or
# real.
my %counters = ( s0 => 0, n0 => 0, s1 => 0, n1 => 0 );
# Also keep a count of all the features.
my $totalFeatures = 0;
# Loop through the assignments.
for my $fid (keys %{$assignHash}) {
# Form the counter key.
my $ss = ( exists $ssHash{$fid} ? "s" : "n" ) . FIG::hypo($assignHash->{$fid} );
# Record this feature.
$counters{$ss} += 1;
$totalFeatures++;
}
Trace("$totalFeatures total features found for $genomeID.") if T(3);
# We have all our data. Next we need to compute the percentages and the links.
# First, the link stuff.
my $linkPrefix = "$options->{linkCGI}?user=\&genome=$genomeID&SPROUT=1&request=";
# Now format the counters and percentages.
for my $type (keys %linkParms) {
$counters{$type} = a( { href => "$linkPrefix$linkParms{$type}" },
sprintf("%d(%.1f%%)", $counters{$type},
Tracer::Percent($counters{$type}, $totalFeatures)));
}
my @counterValues = map { $counters{$_} } @columnTypes;
# Create the row text. We use a list reference to distribute the TD tag
# across all the cells.
my $rowHtml = td(["$genomeName$new", $genomeLen, $pegCount,
@counterValues, $rnaCount,
]);
# Put it in the row hash.
$rows{$genomeName} = $rowHtml;
}
# Set up a counter.
my $rowCount = 0;
# Now we have all the rows set up. We want to sort them and then
# write them to the output file. First, we create a variable
# we can use to select the even or odd style.
my $rowType = 0;
# Loop through the rows.
for my $rowID (sort keys %rows) {
# Format the row.
print GROUPFILE Tr( { class => $rowStyle[$rowType] }, $rows{$rowID} ) . "\n";
# Flip the row type.
$rowType = 1 - $rowType;
# Count the row.
$rowCount++;
}
# All done, terminate the table and close the file.
print GROUPFILE "

\n";
close GROUPFILE;
Trace("$rowCount genomes processed.") if T(2);
}
# We're all done.
Trace("Processing complete.") if T(2);
}
=head3 Fix
C<< my %fixedHash = Fix(%groupHash); >>
Prepare a genome group hash for processing. Groups with the same primary name will be combined.
The primary name is the first capitalized word in the group name.
=over 4
=item groupHash
Hash to be fixed up.
=item RETURN
Returns a fixed-up version of the hash.
=back
=cut
sub Fix {
# Get the parameters.
my (%groupHash) = @_;
# Create the result hash.
my %retVal = ();
# Copy over the genomes.
for my $groupID (keys %groupHash) {
# Make a safety copy of the group ID.
my $realGroupID = $groupID;
# Yank the primary name.
if ($groupID =~ /([A-Z]\w+)/) {
$realGroupID = $1;
}
# Append this group's genomes into the result hash.
Tracer::AddToListMap(\%retVal, $realGroupID, @{$groupHash{$groupID}});
}
# Return the result hash.
return %retVal;
}
1;