#!/usr/bin/perl use POSIX qw(floor); $total_genes=0; my @InterGenesGFF; my $numInter = 0; my @InterGenesSeq; my $lx = 3; #NB: Read this in! my @B = qw/A C G T/; my($hdr,$seq,$word,@fw,$Nw,$b,$Lw,$L,$string,$scaffnum,@tot,@p,@px,%xmap,$N,$Ls); my($f1); my $outFilename; my $genomeFile; my $outFileSet=0; my $GFFset=0; my $PTTset=0; my $Seqset=0; #Xi Added here. my %count_hash =(); my %occr_hash = (); #Finished here. printf("Welcome to the SOMBRERO Background Extraction Script\nBy Shaun Mahony & David Hendrix\n"); $x=0; while($x<$#ARGV) { if($ARGV[$x] eq "-g") { unless (open(GENOME, $ARGV[$x+1])) { die "Cannot open file: $!"; } $genomeFile = $ARGV[$x+1]; }elsif($ARGV[$x] eq "-gff") { if($PTTset == 0) { unless (open(GENES, $ARGV[$x+1])) { die "Cannot open file: $!"; } GFF2Intergenic(); $GFFset=1; } }elsif($ARGV[$x] eq "-ptt") { if($PTTset == 0) { unless (open(GENES, $ARGV[$x+1])) { die "Cannot open file: $!"; } PTT2Intergenic(); $PTTset=1; } }elsif($ARGV[$x] eq "-seq") { unless (open(SEQS, $ARGV[$x+1])) { die "Cannot open file: $!"; } $Seqset=1; }elsif($ARGV[$x] eq "-x") { $lx = $ARGV[$x+1]; }elsif($ARGV[$x] eq "-out") { $outFilename = $ARGV[$x+1]; $outFileSet=1; } $x++; } $lx = $lx+1; if($x==0) #if it's still 0; there was never any args { PrintHelp(); } else { printf("Generating the %d-order background model. This may take a few moments.\n", ($lx-1)); if($outFileSet==0){ $outFilename = "out.back"; } open(OUT, ">", $outFilename) || die("Can't open output file: $!"); if($Seqset==1) { ReadSeq(); }else{ Intergenic2Seq(); } GenMCMC(); printf("Finished!\n"); } sub GFF2Intergenic { @linesG=; seek(GENES, 0, 0); #Seek to beginning of file while($_=) { #Count the total number of known genes $total_genes++; } $j=0; $i=0; $k=0; while($j<$total_genes) { @details = split ' ', $linesG[$j]; #format: "Name Source Feature start stop score strand frame $start = $details[3]; $stop = $details[4]; $strand = $details[6]; $i=$j+1; @details_next=split ' ', $linesG[$i]; $start_next = $details_next[3]; $stop_next = $details_next[4]; while($i<$total_genes && $stop_next<$stop) { $i++; @details_next=split ' ', $linesG[$i]; $start_next = $details_next[3]; $stop_next = $details_next[4]; } #by here, start_next should hold the next gene on from stop #so... take the region between as the intergenic. if($start_next<$stop){ #take no action... genes are overlapping } elsif($start_next-$stop>10) { $InterGenesGFF[$numInter] = $k."\tintergenic\tgene\t". ($stop+1) ."\t".($start_next-1) ."\t.\t+\t.\n"; $numInter++; $k++; } #Increment for next prediction $j++; } } sub PTT2Intergenic { @linesG=; #find the beginning of the gene annotations first $y=0; seek(GENES, 0, 0); #Seek to beginning of file while($_=) { #Count the total number of known genes @tmp = split /\s+/; unless(($tmp[2] eq "+") || ($tmp[2] eq "-")) { $y++; } $total_genes++; } $total_genes = $total_genes-$y; $j=$y; $i=$y; $k=0; while($j<$total_genes) { @details = split /\s+/, $linesG[$j]; #format: "Name Source Feature start stop score strand frame @ss = split /\.\./, $details[1]; $start = $ss[0]; $stop = $ss[1]; $strand = $details[2]; $i=$j+1; @details_next=split /\s+/, $linesG[$i]; @ss = split /\.\./, $details_next[1]; $start_next = $ss[0]; $stop_next = $ss[1]; while($i<$total_genes && $stop_next<$stop) { $i++; @details_next=split /\s+/, $linesG[$i]; @ss = split /\.\./, $details_next[1]; $start_next = $ss[0]; $stop_next = $ss[1]; } #by here, start_next should hold the next gene on from stop #so... take the region between as the intergenic. if($start_next<$stop){ #take no action... genes are overlapping } elsif($start_next-$stop>10) { $InterGenesGFF[$numInter] = $k."\tintergenic\tgene\t". ($stop+1) ."\t".($start_next-1) ."\t.\t+\t.\n"; $numInter++; $k++; } #Increment for next prediction $j++; } } sub Intergenic2Seq { my $contig; #Read the genome file first #first remove the header line (and check if it's a Genome file? while($line = ) { unless($line =~ />/) { chomp($line); $contig = $contig . $line; } } $i=0; while($i<$numInter) { $line = $InterGenesGFF[$i]; @details = split ' ', $line; #format: "Name Source Feature start stop score strand frame $start = $details[3]; $stop = $details[4]; #extract the sequence $InterGenesSeq[$i] = substr($contig, ($start-1), (($stop-$start)+1)); $i++; } } sub ReadSeq { $i=-1; while($line=) { if($line =~ />/) { #printf("Hello\n"); $i++; $InterGenesSeq[$i]=""; } else { chomp($line); $InterGenesSeq[$i]=$InterGenesSeq[$i].$line; } } $numInter = $i; } sub GenMCMC { $N=@B**$lx; setup(); readsequence(); printinfo(); } sub setup { my($i,$j,$L); undef %xmap; # Set up builds the primary arrays that will be used throughout. for($L=1;$L<=$lx;$L++) { $tot[$L]=0; for($i=0;$i<@B**$L;$i++) { $xmap{$L}{ numbword($i,$L) } = $i; $occr_hash{numbword($i,$L)} = 1; #Xi added here. By default the pseudocount is 1. $count_hash{$L}+=1;#Also need to add to Freq. $px[$L][$i]=1; $tot[$L]+=1; } } } sub readsequence { my($fg,$str,$n,$junk,$min,$i_min,$tot,$count); my($i,$j,$k,$L,@genes,$Ng,$Ns,@b,@notfound,@sortgenes,%gened); my($start,$stop); my(%seqnamemap,$seqname,$name,@a); # Now read in the data set that corresponds to the genomic sequence. $z=0; $f=0; #Xi added from here. # To calculate the overall occurence of each kmer in the ori sequence along with the revcom as well. while($f <= $numInter){ $seq = $InterGenesSeq[$f]; $seq =~ tr/a-z/A-z/; for($j=1;$j<=$lx;$j++){ $count_hash{$j} += 2*(length($seq)-$j+1); } #my @keys = keys %occr_hash; #foreach my $k (@keys){ # chomp $k; # my $size = length $k; #$count_hash{$size} = 2*(length($seq)-$size+1); for($k=1;$k<=$lx;$k++){ for ($i=0;$i/) { #$hdr=$_; #} else { $seq=$InterGenesSeq[$z]; $seq =~ tr/a-z/A-Z/; #@a=split(/ /,$hdr); #($junk,$name)=split(/\>/,$a[0]); for($i=0;$i1; $L--) { for($i=0;$i<@B**$L;$i++) { $px[$L][$i]/=$px[$L-1][$xmap{$L-1}{substr(numbword($i,$L),0,$L-1)}]; } } } sub printinfo { my($i,$j,$k); for($L=1;$L<=$lx;$L++) { for($i=0;$i<@B**$L;$i++) { print OUT "$i ", numbword($i,$L)," ".$px[$L][$i]."\t".$occr_hash{numbword($i,$L)}."\t".$occr_hash{numbword($i,$L)}/$count_hash{$L}."\n"; #Xi modified here. } } } sub revcmp { # Returns the reverse complement of the input sequence. $_=$_[0]; tr/acgtACGT/tgcaTGCA/; $_=reverse($_); return $_; } sub wordnumb { # for a give word, it gives its number. my($str)=@_; my($n,$q); $q=0; for($n=0;$n=0;$n--) { if($n>0) { push(@num,floor($innum / (@B**$n))); $innum = $innum % (@B**$n); } else { push(@num,$innum); } } $str = ""; for($n=0;$n<@num;$n++) { $str=$str . $B[$num[$n]]; } return $str; } sub PrintHelp{ #printf("\n\tArguments: \n\n\t-g [genome_file]: The name of the contiguous genome sequence file\n\n"); #printf("\t-gff [GFF_file]: The GFF file of gene locations for the genome\n"); #printf("\t-ptt [PTT_file]: The PTT file of gene locations for the genome\n"); #printf("\t\tUse -gff OR -ptt! Not both!\n\n"); printf("\t-seq [FastA_file]: FastA format sequences of intergenic regions\n"); printf("\t-x [len]: The length of the Markov Chain (default = 3)\n"); printf("\t-out [name]: Output filename (Default: out.back)\n"); }