#!/usr/bin/perl ## Author: Arvind Gopu ## Contact: http://peart.ucs.indiana.edu/contact.html ## Date: June 4 2003 ### CHANGE --- aug 02 ## This script extracts from a BAG cluster output file (prefix.cluster.result). ## Also creates a file which list only FINAL clusters -- no more splits (Default -- filename.real_clusters ## Creates a file of parent-child listings (how clusters are split) ## Use after doing fasta_all_all , f2b (or fastaAllAll_To_Bag) and then bag ## Usage: Just run the program without parameters to get a short description of options use Getopt::Long; GetOptions ("i=s" => \$infile, ## .result file from BAG 'o:s' => \$outfile, ## Output file name -- used only with -mergedclusters option 'P:s' => \$bag_prefix, ## File prefix used for BAG 'src:s' => \$srcfile, ## Source file to construct sequence files used with next par 'align!' => \$align, ## Use this flag if you want to multiply align the sequences 'clustersdir:s' => \$clusters_dir, ## Use this to specify a directory where clusters will be created 'clustalwpath:s' => \$clustalw_path, ## Use this to specify where clustalw exists 'mergedclusters!' => \$merged_clusters, ## Use this flag to build merged clusters -- requires .advive.final_merge file 'quiet!' => \$quiet, 'IBMSP' => \$ibm_sp, ## Is the program called from IBM SP ? -- does not allow redirection to /dev/null 'redirectsys:s' => \$sys_redirect, ); ## Global environment $special_char = "^"; ## Used to differentiate repeated sequences ## Checking for required options if ($infile eq "") { &usage ("create_seqfiles_for_clusters_align ERROR: Required Data file not given.\n"); exit; } if (defined ($align)) { $align = 1; } if (defined ($quiet)) { $quiet = 1; } else { $quiet = 0; } if ((!defined($bag_prefix)) && (!defined($merged_clusters))){ my @prefix_array = split (/\./, $infile); $bag_prefix = $prefix_array[0]; print "\n File prefix used for BAG not specified.. Using default: \"$bag_prefix\"..."; } if ($bag_prefix =~ /\.|_/) { print "\n Create_seqfiles_for_clusters.pl error: Please do not include . or _ in the BAG file prefix.. Given: $$clustalw_path ... Exiting now ...\n\n"; exit; } ## Setting redirection variables here and SP stuff my $general_redirect1 = ""; my $general_redirect2 = ""; if (!defined($ibm_sp)) { $general_redirect1 = "> create_seq_temp1.out"; ## Not used on 9/1/2003 if ($sys_redirect eq "") { $sys_redirect = ">& /dev/null"; } } else { if (($clustalw_path eq "") || (!defined($clustalw_path))) { $clustalw_path = "/bioinfo/spbin"; if ($quiet == 0) { print "\n Clustalw path not specified.. Using IBM SP default: $clustalw_path..."; } } } if ((!defined($clustalw_path)) && ($align == 1)) { $clustalw_path = "/usr/local/dna/bin"; print "\n Clustalw path not specified.. Using default: $$clustalw_path ...\n"; } if (defined($merged_clusters)) { $merged_clusters = 1; } else { $merged_clusters = 0; } if ((!defined($clusters_dir)) && ($merged_clusters == 0)) { $clusters_dir = $bag_prefix."clusters"; print "\n Clusters sub-directory not specified.. Using default: $clusters_dir...\n"; } ## GLOBALS used pretty early if ($merged_clusters == 0) { my $srcfile_read; my $sizefile = $infile; my %hsrcseqs; ## Will hold sequence sans seq_id my %hsrcseq_occured; ## Will hold value of # of times a seq has occured (in BAG output) my $cl_count = 0; my %hparent; ### Processing starts here $command = "mkdir $clusters_dir $sys_redirect"; ## Create a new sub dir "$clusters_dir" if it doesnt exist. $result = `$command`; open (SRCFILE, "$srcfile") or die "$0 Error: Opening file $srcfile \n $!"; while (my $line = ) { $srcfile_read = $srcfile_read.$line; } @src_seqs = split />/, $srcfile_read; ## Will BEGIN only at index 1 <<<<<---------------<<< ------ for (my $i =0; $i <= $#src_seqs; $i++) { # print "\n Seqs [$i]: ****", $src_seqs[$i], "****"; @seqid_others = split /\n/, $src_seqs[$i]; # print "\n Seqs ID [$i]: ****", $seqid_others[0], "****"; ## Try to create hashes separately for Key: seq_ids => Value sequences themselves sans seqIds @each_seq = split (/\n/, $src_seqs[$i]); ## Get only sequence sans seq_id in this my $just_sequence = $just_sequence; for (my $j =1; $j <= $#each_seq; $j++) { $just_sequence = $just_sequence.$each_seq[$j]; } $hsrcseqs{$seqid_others[0]} = $just_sequence; $hsrcseq_occured{$seqid_others[0]} = 0; } ## Done processing SRC file read in. open (INFILE, "$infile") or die "$0 Error: Opening file $infile \n $!"; $sizefile =~ s/result/size/; open (SIZEFILE, ">$sizefile") or die "$0 Error: Opening Size file $sizefile \n $!"; $/ = ""; ## reads one cluster at a time -- \n\n is delimiter while (my $whole_line = ) { ## Write sequence files for all clusters (irrespective of further splits ## The following code is *totally* dependent on BAG output result file format @cluster_lines = split /\n/, $whole_line; @cluster_id_others = split /CLUSTER /, $cluster_lines[0]; @cluster_id_others2 = split / size/, $cluster_id_others[1]; $cluster_id = $cluster_id_others2[0]; ## Used in hashing of parent - child later ## 9/30/2003 -- Grabbing cluster sizes and printing them to a .size file (essentially s/result/size/) $cluster_id_others2[1] =~ /=\s*(\d*).*/; print SIZEFILE "$cluster_id\t$1\n"; ## An individual cluster file in "$clusters_dir" sub dir for each END cluster in BAG output open (SEQFILE, ">$clusters_dir/$cluster_id.seq") or die "$0 Error: Opening file $clusters_dir/$cluster_id.seq \n $!"; ## First and last lines can be discarded for sequence file... for (my $i =1; $i < $#cluster_lines; $i++) { $cluster_lines[$i] =~ s/\s+/ /sg; ## Remove unwanted space @required_others = split / /, $cluster_lines[$i]; ## Will contains seqid (required) + junk ## index 1 will contain sequence id coz of preceding space print SEQFILE "\n>", $required_others[1]; ## Print seq_ids with > sign preceeding it for (my $occured = 0; $occured < $hsrcseq_occured{$required_others[1]}; $occured++) { print SEQFILE $special_char; ## Print # of ^ or whatever symbols according to how many times it has occured } print SEQFILE "\n", $hsrcseqs{$required_others[1]}; ## Get it from hash of sequences $hsrcseq_occured{$required_others[1]}++; ## Update occurence information } close (SEQFILE); if ($align == 1) { print " Aligning cluster $cluster_id now ... \n"; $align_command = "$clustalw_path/clustalw $clusters_dir/$cluster_id.seq -output=gde $sys_redirect"; $align_result = `$align_command`; } ## Try deleting these at the end of the process -- might help in avoiding recreation of some of these # system("rm $clusters_dir/$cluster_id.dnd"); #### DONT DO THIS! ### system("rm $clusters_dir/$cluster_id.seq"); } close (INFILE); close (SIZEFILE); } else { ### This is used when called with -mergedclusters option is used ## Following 3 variables for reading in bag...result file indexing on clusterid and seq_index my @result_lines_temp = (); my @result_lines = ( @result_lines_temp ); my %cluster_id_index_map = (); my $current_cluster_index = 0; ## This will be value to prev hash , key being original cluster_id open (INFILE, "$infile") or die "$0 Error: Opening file $infile \n $!"; $/ = "ENDCLUSTER"; ## reads one cluster at a time while (my $whole_line = ) { ## Retain order -- leading spaces, arti-s and ARTI-s and then multi-spaces $whole_line =~ s/^\s*//g; ## remove leading space (beginning of line) ## Handling special case which has SPLIT INTO on it if ($whole_line !~ /SPLIT/) { ## Ignore whatever is split further ... if ($whole_line =~ /arti|ARTI/) { my $seq_count = 0; my $arti_seq_count = 0; my $arti_seqs_temp = ""; ## print "\nWL ARTI : <$whole_line> ------ Will process more \n"; my @temp = split (/\n/, $whole_line); $whole_line = $temp[0]."\n"; ## Just add CLUSTER line for (my $i = 1; $i < $#temp; $i++) { if ($temp[$i] !~ /arti|ARTI/) { $whole_line = $whole_line.$temp[$i]."\n"; $seq_count++; } else { $arti_seqs_temp = $arti_seqs_temp.$temp[$i]."\n"; $arti_seq_count++; ## print "\nWL ARTI line : <", $temp[$i] , "> ------ Will Avoid "; } } if ($arti_seq_count >= ($#temp - 1)) { ## if it's all artis/ARTIS ... ## print "\n USELESS ARTI STUFF with arti count $arti_seq_count for cluster ", $temp[0], "\n"; } else { $whole_line = $whole_line.$arti_seqs_temp; ## print "\n Useful ARTI STUFF \n"; } $whole_line = $whole_line.$temp[$#temp]; } $whole_line =~ s/arti|ARTI//g; ## remove leading space (beginning of line) $whole_line =~ s/\s+/ /g; ## space separates everything within each cluster ### Split on space -- index 1: cluster id, index 3: size, index (4 to $#-1): seqs, ignore $# @clusterid_and_seqs = split (/ /, $whole_line); ## print "\nWL: <$whole_line>\n"; ## print " <$clusterid_and_seqs[1]> <$clusterid_and_seqs[3]> <$clusterid_and_seqs[4]> <$clusterid_and_seqs[5]> <", $clusterid_and_seqs[$#clusterid_and_seqs], ">\n"; if ($clusterid_and_seqs[1] ne "") { ## To avoid blank cluster id keys $cluster_id_index_map{$clusterid_and_seqs[1]} = $current_cluster_index; ## Map cluster id to clusterid_index for (my $i = 4; $i <= ($#clusterid_and_seqs - 1); $i++) { ## starts at 4 $result_lines[$current_cluster_index][$i-4] = $clusterid_and_seqs[$i]; ## Note $i-4 ## print " At [$current_cluster_index][", $i-4, "]: ", $result_lines[$current_cluster_index][$i-4]; } $current_cluster_index++; ## Increment cluster index } } } close (INFILE); ## Now @result_lines[][] contains all cluster-seqs foreach $key (keys %cluster_id_index_map) { ### print "\n key $key: ", $cluster_id_index_map{$key}; } my $final_merge_file = $infile; $final_merge_file =~ s/.result/.merge.advice.final_merge/; if (!defined($outfile)) { $outfile = $infile; $outfile =~ s/.result/.final_result/; if ($quiet == 0) { print "\n Final Cluster output file not specified.. Using default: $outfile...\n"; } } $/="\n"; open (MFILE, "$final_merge_file") or die "$0 Error: Opening file $final_merge_file \n $!"; open (OUTFILE, ">$outfile") or die "$0 Error: Opening file $final_merge_file \n $!"; my $final_cluster_id = 0; ## Starts at zero -- independent of original cluster id my %cluster_visited = (); my %seq_visited = (); ## used for printing my $final_output_line = ""; my $concat_cluster_id = ""; ## Will have merged clusterids my $seq_list = ""; my $size_cluster = 0; while (my $line = ) { $concat_cluster_id = ""; ## Reset $seq_list = ""; $size_cluster = 0; chomp $line; $line =~ s/^\s*//; ## remove leading space (beginning of line) ## print "\n NEW CLUSTER \n"; @sub_clusters = split (/ /, $line); $first_printing = 1; ## Everytime this will be set to one once for (my $i = 0; $i <= $#sub_clusters; $i++) { ## print " \n<$i>: <", $sub_clusters[$i], "> with fcid : <$final_cluster_id>"; if ($first_printing == 1) { $concat_cluster_id = $concat_cluster_id.$sub_clusters[$i]; $first_printing = 0; } $concat_cluster_id = $concat_cluster_id."-".$sub_clusters[$i]; $cluster_visited{$sub_clusters[$i]} = "YES"; for (my $j = 0; $j <= $#{ $result_lines[$cluster_id_index_map{$sub_clusters[$i]}] }; $j++) { # if ($seq_visited{$result_lines[$cluster_id_index_map{$sub_clusters[$i]}][$j]} ne "YES") { ## print " <$i><$j>: <", $result_lines[$cluster_id_index_map{$sub_clusters[$i]}][$j], ">\n"; $seq_visited{$result_lines[$cluster_id_index_map{$sub_clusters[$i]}][$j]} = "YES"; $seq_list = $seq_list."\n\t".$result_lines[$cluster_id_index_map{$sub_clusters[$i]}][$j]; $size_cluster++; # } } } $final_cluster_id++; $final_output_line = "\nCLUSTER $concat_cluster_id size= $size_cluster".$seq_list."\nENDCLUSTER\n"; if ($size_cluster > 0) { print OUTFILE $final_output_line; } else { ### print "\n Zero size concat cluster $concat_cluster_id with size $size \n"; } } close (MFILE); foreach $key (keys %cluster_id_index_map) { if ($cluster_visited{$key} ne "YES") { $seq_list = ""; $size_cluster = 0; ## print " Key $key Visited NO ?: <", $cluster_visited{$key}, ">\n"; $cluster_visited{$key} = "YES"; for (my $j = 0; $j <= $#{ $result_lines[$cluster_id_index_map{$key}] }; $j++) { if ($seq_visited{$result_lines[$cluster_id_index_map{$key}][$j]} ne "YES") { ## print " <$i><$j>: <", $result_lines[$cluster_id_index_map{$sub_clusters[$i]}][$j], ">\n"; $seq_visited{$result_lines[$cluster_id_index_map{$key}][$j]} = "YES"; $seq_list = $seq_list."\n\t".$result_lines[$cluster_id_index_map{$key}][$j]; $size_cluster++; } } $final_output_line = "\nCLUSTER $key size= $size_cluster".$seq_list."\nENDCLUSTER\n"; if ($size_cluster > 0) { print OUTFILE $final_output_line; } else { ### print "\n Zero size cluster $concat_cluster_id with size $size \n"; } } } close (OUTFILE); } if ($quiet == 0) { print "\n ... DONE ... \n\n"; } sub usage { my ($error_mesg) = @_; if ($error_mesg ne "") { print "\n $error_mesg \n"; } print " Usage: perl create_seqfiles_for_clusters.pl -i -s [-no_create_seq -o -align] Example usage: perl create_seqfiles_for_clusters.pl -i file.cluster.result -s all_seqs \n \n"; } ### @{$result_lines[0]} = @ttt; #### CHANGES: ### 8/2/2003: Alignment made an optional parameter -align ### no_create_seq options removed since it does not make sense anymore ### clusters subdir and clustalw path options added ### 8/3/2003: Added -mergedclusters option which creates seqs-result file called "infile.final_result" ### using .advice.final_merge file and .result file ### 8/7/2003: Changed space to "-" as the thingy separating merged subclusters in final_result file. ### 8/8/2003: Removed printing of clusters which are further split -- DUMB should have had this before ### Do not print clusters which are all-ARTIS or all-artis ### 8/31/2003: Clusters dir changed to $bag_prefix.clusters ### 9/1/2003 : Added in sys-redirect variables so it can work in IBM SP ! ### 9/31/2001: Added in .size file thingy -- prints size of each cluster in a file to be used by prepare*.pl later