#!/usr/bin/perl
use strict;
use warnings;
use Tie::File;
use Data::Dumper;
use Benchmark;
my $t0 = Benchmark->new;
# all files in the current folder with $ext will be input.
# Default $ext is "pileup"
# if entered, second user entered input will be set to $ext
my $ext = "pileup";
if(exists $ARGV[1]) {
$ext = $ARGV[1];
}
# open current directory & store filenames with $ext into @pileupfiles
opendir (DIR, ".");
my @pileupfiles = grep {-f && /\.$ext$/} readdir DIR;
my $dnasegment;
my $pos;
my $total;
my $g_total;
my @index; #hold current index for each tied file
my @totalfiles; #hold total files in each sub-index
# $filenum is iterator to cycle through all pileup files whose names are stored in pileupfiles
my $filenum = 0;
# @tied is an array holding all arrays of tied files
my @tied;
# array of the current line number for each @file,
my @linenum;
# tie each file to an array that is an element of the @tied array
while($filenum < scalar @pileupfiles) {
my @file;
tie @file, 'Tie::File', $pileupfiles[$filenum], recsep => "\n" or die;
push(@tied, [@file]);
# set each line's value of $linenum to 0
push(@linenum, 0);
$filenum++;
}
# open user list of dnasegments
open(LIST, $ARGV[0]);
# open file for output
open(OUT, ">>tempfile.tab");
while(<LIST>) {
$dnasegment = $_;
chomp $dnasegment;
my $exit = 0;
$pos = 1;
my %flag;
while(scalar(keys %flag) < scalar @tied) {
$total = 0;
$filenum = 0;
while($filenum < scalar @tied) {
if(exists $tied[$filenum][$linenum[$filenum]]) {
my @line = split(/\t/, $tied[$filenum][$linenum[$filenum]]);
#print $line[0], "\t", $line[1], "\t", $line[3], "\n\n";
if($line[0] eq $dnasegment) {
if($line[1] == $pos) {
$total += $line[3];
$linenum[$filenum]++;
$g_total += $line[3];
print OUT "$dnasegment\t$filenum\t$pos\t$line[3]\n";
}
} else {
$flag{$filenum} = 1;
}
} else {
#print $flag, "\n";
$flag{$filenum} = 1;
}
$filenum++;
}
if($total > 0) {
print OUT "$dnasegment\t$total\n";
}
$pos++;
}
}
close (LIST);
close(OUT);
my $t1 = Benchmark->new;
my $td = timediff($t1, $t0);
print timestr($td), "\n";
the above code takes all files with a default or user-entered file extension in a directory and calculates the total occurrences (column 4 of input files) for a location (column 2 of input files) from specific entries (column 1 of input files where column 1 matches a name included in a file provided on the command line). the layout of the files to be used by the program are: file 1:
Gm02 11896804 G 2 ., \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896822 G 3 .,, B`D
file 2:
Gm02 11896804 G 3 .,, \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896813 G 3 .,, B`D
file 3:
Gm02 11896804 G 3 .,, \'
Gm02 11896805 G 7 ......, U`
Gm02 11896806 G 3 .,. Sa
Gm02 11896807 T 2 ., U\
Gm02 11896808 T 2 ., ZZ
Gm02 11896809 T 2 ., ZZ
Gm02 11896810 T 2 ., B\
Gm02 11896811 G 3 .,^!, B]E
Gm02 11896812 A 3 T,, BaR
Gm02 11896833 G 3 .,, B`D
in this case the only command line argument passed to the program would be a text file with "Gm02" as its contents.
a hash is used to keep track of the locations that have been processed already. in the above example files, all three files will be checked to count from position 1 through 11896803 before it encounters the first values at position 11896804. this is to ensure that all positions are checked and summed in all files before incrementing position.
my question has to do with performance. i made the decision to use Tie::File because it was my understanding this would improve the performance because all of the files would not be read into memory. the real data to be processed by the program is many hundreds of thousands of lines in length multiplied by dozens of files. at this point, the time taken to run on example file1 alone and on all 3 example files are 42 wallclock secs (41.96 usr + 0.00 sys = 41.96 CPU) and 110 wallclock secs (109.76 usr + 0.00 sys = 109.76 CPU), respectively. any information as to why this program is running so slowly or recommendations on how to speed it up would be very appreciated.
edit 10:17PM EST: the output from the program is as follows:
Gm02 0 11896804 2
Gm02 1 11896804 3
Gm02 2 11896804 3
Gm02 8
Gm02 0 11896805 7
Gm02 1 11896805 7
Gm02 2 11896805 7
Gm02 21
Gm02 0 11896806 3
Gm02 1 11896806 3
Gm02 2 11896806 3
Gm02 9
Gm02 0 11896807 2
Gm02 1 11896807 2
Gm02 2 11896807 2
Gm02 6
Gm02 0 11896808 2
Gm02 1 11896808 2
Gm02 2 11896808 2
Gm02 6
Gm02 0 11896809 2
Gm02 1 11896809 2
Gm02 2 11896809 2
Gm02 6
Gm02 0 11896810 2
Gm02 1 11896810 2
Gm02 2 11896810 2
Gm02 6
Gm02 0 11896811 3
Gm02 1 11896811 3
Gm02 2 11896811 3
Gm02 9
Gm02 0 11896812 3
Gm02 1 11896812 3
Gm02 2 11896812 3
Gm02 9
Gm02 1 11896813 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
Gm02 2 11896833 3
Gm02 3
Gm02 0 11896804 2
Gm02 1 11896804 3
Gm02 5
Gm02 0 11896805 7
Gm02 1 11896805 7
Gm02 14
Gm02 0 11896806 3
Gm02 1 11896806 3
Gm02 6
Gm02 0 11896807 2
Gm02 1 11896807 2
Gm02 4
Gm02 0 11896808 2
Gm02 1 11896808 2
Gm02 4
Gm02 0 11896809 2
Gm02 1 11896809 2
Gm02 4
Gm02 0 11896810 2
Gm02 1 11896810 2
Gm02 4
Gm02 0 11896811 3
Gm02 1 11896811 3
Gm02 6
Gm02 0 11896812 3
Gm02 1 11896812 3
Gm02 6
Gm02 1 11896813 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
Gm02 0 11896804 2
Gm02 2
Gm02 0 11896805 7
Gm02 7
Gm02 0 11896806 3
Gm02 3
Gm02 0 11896807 2
Gm02 2
Gm02 0 11896808 2
Gm02 2
Gm02 0 11896809 2
Gm02 2
Gm02 0 11896810 2
Gm02 2
Gm02 0 11896811 3
Gm02 3
Gm02 0 11896812 3
Gm02 3
Gm02 0 11896822 3
Gm02 3
chomp $dnasegment
is scary. ;-) – simbabque