-
Notifications
You must be signed in to change notification settings - Fork 0
/
tapscan_classify_v4.76.pl
executable file
·932 lines (765 loc) · 30.2 KB
/
tapscan_classify_v4.76.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
#!/usr/bin/perl
use strict;
use warnings;
use File::Basename;
my $tapscan_version = "v4.76";
print "Running TAPscan Classify version $tapscan_version \n\n";
# Written by Gerrit Timmerhaus (gerrit.timmerhaus@biologie.uni-freiburg.de).
# Changes included by Kristian Ullrich, Per Wilhelmsson, Romy Petroll and Saskia Hiltemann.
# Script to extract all detected domains out of a hmmsearch results file and classify the families of all used proteins based on these domains.
# The classification depends on a table which contains all known classification rules for the protein families of interest and on specific coverage values defined for every domain.
# The script provides three outputs, namely output.1, output.2 and output.3. The output files are tables in ";"-delimited format.
# The structure of output.1 is: "sequence ID ; TAP family ; number of classifications ; domains".
# Output.3 shares in principle the same structure as output.1, except that subfamilies are considered. ("sequence ID ; TAP family ; Subfamily ; number of classifications ; domains")
# The superior TAP family is specified first, followed by the subfamily. If a TAP family has no subfamily, the TAP family is specified first and then a "-".
# The structure of output.2 is: "TAP family";"number of detected proteins".
# More than one entry for a protein is possible because the classification rules may allow more than one classification.
#
# The script must be startet with the arguments <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <"filter" if desired>
if (!@ARGV or ($ARGV [0] eq "-h") or ($ARGV [0] eq "-help")) {
print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamily classifications file> <\"filter\" (if desired)>\n\n";
exit;
}
# hmmsearch_output: domtblout file
my $hmmsearch_output = $ARGV [0];
# decision_table: rules file
my $decision_table = $ARGV [1];
# family_classifications: output.1
my $family_classifications = $ARGV [2];
# family_statistics: output.2
my $family_statistics = $ARGV [3];
# subfamily_classifications: output.3
my $subfamily_classifications = $ARGV [4];
# domspec_cuts: coverage values file
my $domspec_cuts = $ARGV [5];
# gene_model_filte: filter for ARATH and ORYSA
my $gene_model_filter = $ARGV [6];
# get basename for output files
my($basename, $dirs, $suffix) = fileparse($hmmsearch_output, qr/\.[^.]*/);
if ($family_statistics eq "") {
print "Usage: extract.and.classify.pl <hmmsearch output file> <classification rules> <output classifications file> <output family statistics file> <output subfamil classifications file> <\"filter\" (if desired)>\n\n";
exit;
}
if ($gene_model_filter and $gene_model_filter eq "filter") {
print "\nGene model filter is activated. It only works for TAIR (Arabidopsis) and TIGR (Rice) proteins up to now\n";
}
# Array where the $hmmsearch-output/domtblout file will be stored
my @output = ();
# Array with domain-specific coverage values
my @cuts = ();
# Array with rules
my @dec_table = ();
# Counter for the number of detected domains in the hmmsearch output file
my $entry_counter = 0;
# Containes the actual result for a query sequence
my $akt_entry = "";
# Used to define query entry to ignore similar domains
my $whole_entry = "";
# Includes the final entries after ignoring similar domains
my @results_of_extraction = ();
# Used to define query entry to ignore similar domains
my $extracted_domain = "";
# Used to define query entry to ignore similar domains
my $present = "";
# Used to define query entry to ignore similar domains
my $protein = "";
my $lek = "";
############################################
### 1. Read in the hmmsearch output file ###
############################################
print "\n*** reading in $hmmsearch_output ***\n\n";
@output = get_file_data("$hmmsearch_output");
print "*** Parsing $hmmsearch_output ***\n\n";
# If wrong format exit the program, ninth row from the end
if ($output [-9] !~ /^# Program: hmmsearch.*/) {
print "wrong file format. $hmmsearch_output has to be a hmmsearch output file\n\n";
exit;
}
### 1.1 Trim input file (hmmsearch) to fit script. Erase lines starting with #.
@output = grep !/^#.*/, @output;
# Read domain-specific coverage values file
# domname cut-off
# EIN3 0.5874125874
open(FILENAME,$domspec_cuts);
my %cuts = map { chomp; split /\t/ } <FILENAME>;
### 1.2 Use first (prot name), third (domain name), eight (dom evalue), fourteenth (# overlapping) and fifteenth (# domain) column of data input(array element).
foreach my $output (@output) {
$output =~ /^(\S+)\s+\S+\s+\S+\s+(\S+)\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+\S+\s+\S+\s+(\S+)\s+\S+\s+\S+\s+\S+\s+(\S+)\s+(\S+)\s+.*/;
$lek = (($6-$5)+1)/$3;
$output = $1."\t".$2."\t".$4."\t".$lek;
}
# Stucture:
# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93
# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95
# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93
# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.63
################################
### 2. Modify the input data ###
################################
### 2.1 Run through length_cut_off loop (throws out entries below coverage values)
my @cutarray;
open(FH, '>', "${basename}_filtered_sequences.txt") or die $!;
print FH "Below are the sequences that were filtered out, along with the reason";
foreach my $output (@output) {
$output =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
if ($4 > $cuts{$2}) {
push (@cutarray, $output);
}
else{
print FH "$output --> did not meet coverage cutoff of $cuts{$2} (cov was $4)\n"
}
}
close(FH);
# End up with (prot name) (domain name) (dom evalue) (overlapping)
# Structure:
# ARATHwo_AT2G03430.1 Ank 1.8e-08 0.93
# ARATHwo_AT2G03430.1 Ank 7.8e-10 0.95
# ARATHwo_AT2G03430.1 Ank 1.2e-08 0.93
### 2.2 Retain hit with the lowest evalue
# String for the previous protein name
my $old_prot = "";
# Coordinates for array filling
my $x = -1;
my $y = 1;
my $scoreeval;
my @newarray;
foreach my $cutarray (@cutarray) {
$cutarray =~ /^(\S+)\t+(\S+)\t(\S+)\t(\S+)/;
$akt_entry = $1.$2;
# If the protein contains more than one of the same domain
if ($old_prot eq $akt_entry) {
if ($3>$scoreeval) {
$y++;
$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
}
else {
$y++;
$newarray[$x] = $1."\t".$2."\t".$scoreeval."\t".$y;
}
}
# If the protein is new reset $j and push the entry in the next array
else {
$y = 1;
$x++;
$newarray[$x] = $1."\t".$2."\t".$3."\t".$y;
}
$old_prot = $akt_entry;
$scoreeval = $3
}
# Final structure:
# ARATHwo_AT2G03430.1 Ank 7.8e-10 3
### 2.3 Then sort @output primarily on prot name($1) and then secondarily on eval($3) to fit the "similar"-procedure.
@newarray = sort { (split '\t', $a)[0] cmp (split '\t', $b)[0] || (split '\t', $a)[2] <=> (split '\t', $b)[2] } @newarray;
# Flags for merged domains
my $ARR_B_counter = 0;
my $bZIP_counter = 0;
my $ET_counter = 0;
# Flags for similar domain omission
my $WD40domain = 0; # modified for FIE rule
my $FIEdomain = 0; # --------'''''--------
my $MYBdomain = 0;
my $G2domain = 0;
my $YBdomain = 0;
my $YCdomain = 0;
my $PHDdomain = 0;
my $Alfindomain = 0;
my $Dr1domain = 0;
my $GATAdomain = 0;
my $Dofdomain = 0;
my $C1domain = 0;
foreach my $line (@newarray) {
# Reset the flags for domain omission
if ($line !~ /^$protein.*$/ ) {
$WD40domain = 0;
$FIEdomain = 0;
$MYBdomain = 0;
$G2domain = 0;
$YBdomain = 0;
$YCdomain = 0;
$PHDdomain = 0;
$Alfindomain = 0;
$Dr1domain = 0;
$GATAdomain = 0;
$Dofdomain = 0;
$C1domain = 0;
}
# Get the Query entry
$line =~ /^(\S+)\s+(\S+)\s+(\S+)\s+(\S+)/;
$protein = "$1";
$extracted_domain = $2;
$present = "$4"; # FIX! number for many Myb
$whole_entry = $1.";".$2.";".$3.";".$4;
# Ignore FIE if it is not better scored than WD40
if ($extracted_domain eq "WD40" and $FIEdomain == 0) {$WD40domain = 1;}
# Ignore similar domains part 1
if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 0) {$MYBdomain = 1;}
if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 0) {$G2domain = 1;}
if ($extracted_domain eq "PHD" and $Alfindomain == 0 and $C1domain == 0) {$PHDdomain = 1;}
if ($extracted_domain eq "Alfin-like" and $PHDdomain == 0 and $C1domain == 0) {$Alfindomain = 1;}
if ($extracted_domain eq "C1_2" and $Alfindomain == 0 and $PHDdomain == 0) {$C1domain = 1;}
if ($extracted_domain eq "GATA" and $Dofdomain == 0) {$GATAdomain = 1;}
if ($extracted_domain eq "zf-Dof" and $GATAdomain == 0) {$Dofdomain = 1;}
# Ignore similar domains part 2
if ($extracted_domain eq "FIE_clipped_for_HMM" and $WD40domain == 1) {next;}
if ($extracted_domain eq "Myb_DNA-binding" and $G2domain == 1) {next;}
if ($extracted_domain eq "G2-like_Domain" and $MYBdomain == 1) {next;}
if ($extracted_domain eq "PHD" and ($Alfindomain == 1 or $C1domain == 1)) {next;}
if ($extracted_domain eq "Alfin-like" and ($PHDdomain == 1 or $C1domain ==1)) {next;}
if ($extracted_domain eq "C1_2" and ($PHDdomain == 1 or $Alfindomain == 1)) {next;}
if ($extracted_domain eq "GATA" and $Dofdomain == 1) {next;}
if ($extracted_domain eq "zf-Dof" and $GATAdomain == 1) {next;}
# Save the entry
push @results_of_extraction, $whole_entry;
$entry_counter++;
}
print "$entry_counter domain matches were found in $hmmsearch_output\n\n";
# Sort the entries:
my @sorted_results_of_extraction = @results_of_extraction;
print "*** Preparing hmmsearch results for classification ***\n\n";
# This array will be filled with the sorted proteins
my $array_of_arrays = [];
# String for the previous protein name
my $old_entry = "";
# Coordinates for array filling
my $i = -1;
my $j = 0;
@dec_table = get_file_data("$decision_table");
if ($dec_table [0] !~ /^[^;]+;[^;]+;[^;]/) {
print "wrong file format. $decision_table has to be in the format family;domain;type\n\n";
exit;
}
# Formate the entries
foreach my $domain_entry (@sorted_results_of_extraction) {
$domain_entry =~ /^([^;]+);/;
$akt_entry = $1;
# If the protein has more than one domain push it behind the old entry
if ($old_entry eq $akt_entry) {
$j++;
$array_of_arrays->[$i][$j] = $domain_entry;
}
# If the protein is new reset $j and push the entry in the next array
else {
$j = 0;
$i++;
$array_of_arrays->[$i][$j] = $domain_entry;
}
$old_entry = $akt_entry;
}
$old_entry = "";
# Remember the number of entries to use them later
my $number_of_proteins = $i+1;
print "$number_of_proteins proteins were found in $hmmsearch_output\n";
$number_of_proteins--;
#########################################
### 3. Get the classification entries ###
#########################################
# Make a list of all families in the classification rules file
my @liste_alle_familien = ('0_no_family_found');
my $ARR_B_in_list = 0;
my $bZIP_in_list = 0;
my $ET_in_list = 0;
my $array_of_classifications = [];
$i = -1;
$j = 0;
my @classifications_input = get_file_data("$decision_table");
# Sort the classifications
my @classifications = sort @classifications_input;
foreach my $classification (@classifications) {
$classification =~ /^([^;]+);/;
$akt_entry = $1;
# If the family has more than one domain entry push it behind the old entry
if ($old_entry eq $akt_entry) {
$j++;
$array_of_classifications->[$i][$j] = $classification;
}
# If the family is new push it in the next array and reset $j
else {
$j = 0;
$i++;
$array_of_classifications->[$i][$j] = $classification;
# Add family to list of all families in the classification rules
push(@liste_alle_familien,$akt_entry);
# Add hardcoded "OR" defined families to list of families if subfamily is present
# Do this just once for each family
if (($akt_entry eq "GARP_ARR-B_Myb") or ($akt_entry eq "GARP_ARR-B_G2")) {
if ($ARR_B_in_list == 0) {
push(@liste_alle_familien,"GARP_ARR-B");
$ARR_B_in_list++;
}}
if (($akt_entry eq "bZIP1") or ($akt_entry eq "bZIP2") or ($akt_entry eq "bZIPAUREO") or ($akt_entry eq "bZIPCDD")) {
if ($bZIP_in_list == 0) {
push(@liste_alle_familien,"bZIP");
$bZIP_in_list++;
}}
if (($akt_entry eq "HRT") or ($akt_entry eq "GIY_YIG")) {
if ($ET_in_list == 0) {
push(@liste_alle_familien,"ET");
$ET_in_list++;
}}
}
$old_entry = $akt_entry;
}
my $number_of_classifications = $i+1;
print "$number_of_classifications different families were found in $decision_table\n\n";
$number_of_classifications--;
print "*** begin classification ***\n\n";
# Create the output files (output.1 and output.3)
my $outputfile = "$family_classifications";
my $subfamilyoutput = "$subfamily_classifications";
unless (open(FAMILY_CLASSIFICATIONS, ">$outputfile")) {
print "Cannot open file \"$outputfile\" to write to!!\n\n";
exit;
}
unless (open(SUBFAMILY_CLASSIFICATIONS, ">$subfamilyoutput")) {
print "Cannot open file \"$subfamilyoutput\" to write to!!\n\n";
exit;
}
print FAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
print SUBFAMILY_CLASSIFICATIONS "family classifications for $hmmsearch_output\n";
# Counter for classified families
my $classified_families = 0;
# Counter for unclassified families
my $unclassified_families = 0;
# Counter for the different amount of entries in classification possibilities
my @entry_counter = [];
# Counter for the possible classifications for the current protein
my $possibilities = 0;
# For later famliy statistics
my @family_list = [];
# This array will be filled with the result. Later written in output.1 and output.3
my @family_classifications_file = [];
my @subfamily_classifications_file = [];
# One time for every protein in array of arrays
for (my $ii = 0; $ii <= $number_of_proteins; $ii++) {
my $jj = 0;
$possibilities = 0;
# Flag to mark that any family was found for the current protein
my $member_found = 0;
# In this string all the domains of one protein will be stored
# The ; signs are important to mark a single entry (for example to differ AP2 and TF_AP2)
my $domains = ";";
# Get the domains in all elements of the current protein array:
while ($array_of_arrays->[$ii][$jj]) {
$array_of_arrays->[$ii][$jj] =~ /([^;]+);[^;]+;(\d+)$/;
# For discrimination between MYB and MYB-related. If more than 1 MYB domains was found replaced "Myb_DNA-binding" with "two_or_more_Myb_DNA-binding"
# Same for LIM
if ($1 eq "Myb_DNA-binding" and $2 eq 2) {
$domains .= "MYB-2R;Myb_DNA-binding;";
}
elsif ($1 eq "Myb_DNA-binding" and $2 eq 3) {
$domains .= "MYB-3R;Myb_DNA-binding;";
}
elsif ($1 eq "Myb_DNA-binding" and $2 eq 4) {
$domains .= "MYB-4R;Myb_DNA-binding;";
}
elsif ($1 eq "LIM" and $2 > 1) {
$domains .= "two_or_more_LIM;LIM;";
}
else {
$domains .= "$1;";
}
$jj++;
}
#########################
### 4. Classification ###
#########################
# Get the name of the actual protein for the output after classification:
$array_of_arrays->[$ii][0] =~ /^([^;]+);/;
$akt_entry = $1;
# Variables for double classification decisions:
# Here the shoulds for the actual family will be stored
my $shoulds_act_family = ";";
# Here the shoulds of the previous CLASSIFIED family will be stored
my $shoulds_prev_family = "";
# This loop will be made for every family entry in output.1 and output.3
for ($i = 0;$i <= $number_of_classifications; $i++) {
$j = 0;
# Flag to mark if the protein is a possible member of the current family
my $possible_member = 0;
# Reset the domain entries for the current family
$shoulds_act_family = ";";
# This loop will be made for every domain classification entry for the current family
while ($array_of_classifications->[$i][$j]) {
# For "should" entries
if ($array_of_classifications->[$i][$j] =~ /;should$/) {
# Get the domain from the classification entry
$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
my $domain_classification_entry = $1;
# Check if the protein might be a member of the current family. If yes check the next entry
if ($domains =~ /;$domain_classification_entry;/) {
$possible_member = 1;
$j++;
# All matched domains for the current family are stored here:
$shoulds_act_family .= "$domain_classification_entry;";
next;
}
$possible_member = 0;
$j++;
last;
}
# For "should not" entries
if ($array_of_classifications->[$i][$j] =~ /;should not$/) {
# Get the domain from the classification entry
$array_of_classifications->[$i][$j] =~ /^[^;]+;([^;]+);[^;]+/;
# Check if the protein has the domain. If yes go to the next family
my $domain_check = $1;
if ($domains =~ /;$domain_check;/) {
$possible_member = 0;
last;
}
# If not check the next entry
else {
$j++;
next;
}
}
# This happens when there is no "should" or "should not" entry at the right position
print "error in classification entry $array_of_classifications->[$i][$j]\n"
}
# If the check was successful print the found classification
if ($possible_member) {
$array_of_classifications->[$i][0] =~ /^([^;]+);/;
my $currentFamily = $1;
$possibilities++;
# The double classified proteins solution:
if ($possibilities > 1) {
# Fill an array with the current domains in $domains
my @domains_array = [];
my $number_of_domains = ($domains =~ tr/;/;/)-1;
my $tempdomains = $domains;
for (my $domain_counter = 0;$domain_counter != $number_of_domains;$domain_counter++) {
$tempdomains =~ /^;([^;]+);/;
$domains_array[$domain_counter] = $1;
# Remove the inserted domain from domains
$tempdomains =~ s/^;[^;]+;/;/;
}
# Compare the two possible families (actual and previous family)
my $array_counter = 0;
while ($array_counter < $number_of_domains) {
if ($shoulds_prev_family =~ /$domains_array[$array_counter]/) {
if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
# If actual domain is in previous and actual family go to next domain
$array_counter++;
next;
}
# Protein was right classified in the previous classification. The actual entry will be ignored
$possibilities--;
last;
}
# If actual domain is in the actual family but not in the previous. The actual classification is right
if ($shoulds_act_family =~ /$domains_array[$array_counter]/) {
pop @family_classifications_file;
pop @subfamily_classifications_file;
# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
if ($ARR_B_counter == 0) {
push @liste_alle_familien, 'GARP_ARR-B';
$ARR_B_counter++;
}
}
elsif ( ($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")) {
push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
if ($bZIP_counter == 0) {
push @liste_alle_familien, 'bZIP';
$bZIP_counter++;
}
}
elsif ( ($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")) {
push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
if ($ET_counter == 0) {
push @liste_alle_familien, 'ET';
$ET_counter++;
}
}
else {
push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
}
$possibilities--;
last;
}
$array_counter++;
}
}
# Normal enty: if the family is classified for one family write it in the array
else {
# Store the matched domains of the actual family for respectively later doubly classification decision
$shoulds_prev_family = $shoulds_act_family;
# Store the entry in the output array
# Merge "GARBP_ARR-B_Myb" and "GARP_ARR-B_G2" to "GARP_ARR-B"
# Add GARP_ARR-B to @liste_alle_familien ONCE
if ( ($currentFamily eq "GARP_ARR-B_Myb") or ($currentFamily eq "GARP_ARR-B_G2")){
push @family_classifications_file, "$akt_entry;GARP_ARR-B;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;GARP_ARR-B;-;$possibilities$domains\n";
if ($ARR_B_counter == 0) {
push @liste_alle_familien, 'GARP_ARR-B';
$ARR_B_counter++;
}
}
elsif (($currentFamily eq "bZIP1") or ($currentFamily eq "bZIP2") or ($currentFamily eq "bZIPAUREO") or ($currentFamily eq "bZIPCDD")){
push @family_classifications_file, "$akt_entry;bZIP;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;bZIP;-;$possibilities$domains\n";
if ($bZIP_counter == 0) {
push @liste_alle_familien, 'bZIP';
$bZIP_counter++;
}
}
elsif (($currentFamily eq "HRT") or ($currentFamily eq "GIY_YIG")){
push @family_classifications_file, "$akt_entry;ET;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;ET;-;$possibilities$domains\n";
if ($ET_counter == 0) {
push @liste_alle_familien, 'ET';
$ET_counter++;
}
}
else {
push @family_classifications_file, "$akt_entry;$currentFamily;$possibilities$domains\n";
push @subfamily_classifications_file, "$akt_entry;$currentFamily;-;$possibilities$domains\n";
}
$classified_families++;
}
# Flag to mark that any family was found for the current protein
$member_found = 1;
}
}
if ($possibilities == 0) {
# If no family was found for the current protein give this out
push @family_classifications_file, "$akt_entry;0_no_family_found;0$domains\n";
push @subfamily_classifications_file, "$akt_entry;0_no_family_found;-;0$domains\n";
$unclassified_families++;
}
# Increment the number of possible members in the array at the right position
else {
my $temp = $entry_counter[$possibilities];
$temp++;
$entry_counter[$possibilities] = $temp;
}
}
# Define for which TAP families also subfamilies may be present:
foreach my $subfamily_classifications_file (@subfamily_classifications_file) {
# If "C2H2" was assigned to a sequence, write "C2H2;C2H2" to output.3 as TAP family and subfamily, respectively.
if ($subfamily_classifications_file =~ /C2H2/ ) {
$subfamily_classifications_file =~ s/C2H2;-/C2H2;C2H2/
}
# If "C2H2_IDD" was assigned to a sequence, write "C2H2;C2H2_IDD" to output.3 as TAP family and subfamily, respectively.
if ($subfamily_classifications_file =~ /C2H2_IDD/ ) {
$subfamily_classifications_file =~ s/C2H2_IDD;-/C2H2;C2H2_IDD/
}
if ($subfamily_classifications_file =~ /bHLH/ ) {
$subfamily_classifications_file =~ s/bHLH;-/bHLH;bHLH/
}
if ($subfamily_classifications_file =~ /bHLH_TCP/ ) {
$subfamily_classifications_file =~ s/bHLH_TCP;-/bHLH;bHLH_TCP/
}
if ($subfamily_classifications_file =~ /NF-YA/ ) {
$subfamily_classifications_file =~ s/NF-YA;-/NFY;NF-YA/
}
if ($subfamily_classifications_file =~ /NF-YB/ ) {
$subfamily_classifications_file =~ s/NF-YB;-/NFY;NF-YB/
}
if ($subfamily_classifications_file =~ /NF-YC/ ) {
$subfamily_classifications_file =~ s/NF-YC;-/NFY;NF-YC/
}
if ($subfamily_classifications_file =~ /C1HDZ/ ) {
$subfamily_classifications_file =~ s/C1HDZ;-/HDZ;C1HDZ/
}
if ($subfamily_classifications_file =~ /C2HDZ/ ) {
$subfamily_classifications_file =~ s/C2HDZ;-/HDZ;C2HDZ/
}
if ($subfamily_classifications_file =~ /C3HDZ/ ) {
$subfamily_classifications_file =~ s/C3HDZ;-/HDZ;C3HDZ/
}
if ($subfamily_classifications_file =~ /C4HDZ/ ) {
$subfamily_classifications_file =~ s/C4HDZ;-/HDZ;C4HDZ/
}
if ($subfamily_classifications_file =~ /MYST/ ) {
$subfamily_classifications_file =~ s/MYST;-/HAT;MYST/
}
if ($subfamily_classifications_file =~ /CBP/ ) {
$subfamily_classifications_file =~ s/CBP;-/HAT;CBP/
}
if ($subfamily_classifications_file =~ /TAFII250/ ) {
$subfamily_classifications_file =~ s/TAFII250;-/HAT;TAFII250/
}
if ($subfamily_classifications_file =~ /GNAT/ ) {
$subfamily_classifications_file =~ s/GNAT;-/HAT;GNAT/
}
if ($subfamily_classifications_file =~ /LOB1/ ) {
$subfamily_classifications_file =~ s/LOB1;-/LBD;LOB1/
}
if ($subfamily_classifications_file =~ /LOB2/ ) {
$subfamily_classifications_file =~ s/LOB2;-/LBD;LOB2/
}
if ($subfamily_classifications_file =~ /MYB-related/ ) {
$subfamily_classifications_file =~ s/MYB-related;-/MYB;MYB-related/
}
if ($subfamily_classifications_file =~ /SWI\/SNF_SWI3/ ) {
$subfamily_classifications_file =~ s/SWI\/SNF_SWI3;-/MYB-related;SWI\/SNF_SWI3/
}
if ($subfamily_classifications_file =~ /MYB-2R/ ) {
$subfamily_classifications_file =~ s/MYB-2R;-/MYB;MYB-2R/
}
if ($subfamily_classifications_file =~ /MYB-3R/ ) {
$subfamily_classifications_file =~ s/MYB-3R;-/MYB;MYB-3R/
}
if ($subfamily_classifications_file =~ /MYB-4R/ ) {
$subfamily_classifications_file =~ s/MYB-4R;-/MYB;MYB-4R/
}
if ($subfamily_classifications_file =~ /RKD/ ) {
$subfamily_classifications_file =~ s/RKD;-/RWP-RK;RKD/
}
if ($subfamily_classifications_file =~ /NLP/ ) {
$subfamily_classifications_file =~ s/NLP;-/RWP-RK;NLP/
}
if ($subfamily_classifications_file =~ /AP2/ ) {
$subfamily_classifications_file =~ s/AP2;-/AP2;AP2/
}
if ($subfamily_classifications_file =~ /CRF/ ) {
$subfamily_classifications_file =~ s/CRF;-/AP2;CRF/
}
}
### 4.1 Filter for gene models
# Filter for gene models. All alternative splice variants will be removed from Arabidopsis and rice entries
shift @family_classifications_file;
shift @subfamily_classifications_file;
# Sort the array for filter and a sorted output
@family_classifications_file = sort @family_classifications_file;
@subfamily_classifications_file = sort @subfamily_classifications_file;
if ($gene_model_filter and $gene_model_filter eq "filter") {
print "*** filtering gene models: removing splice variants ***\n";
# temporary array for the filtered entries
my @filtered_entries = [];
my $models_removed_counter = 0;
my $prev_entry = "";
foreach my $entry_line (@family_classifications_file) {
if ($entry_line !~ /^([^.]+).\d+/) {
print "Bad format for filtering splice variants. Use \"filter\" only for TAIR (Aradopsis) or TIGR (Rice) files.\n\n";
exit;
}
if ($1 eq $prev_entry) {
$models_removed_counter++;
next;
}
$prev_entry = $1;
push @filtered_entries, $entry_line;
}
print " -> $models_removed_counter gene models were removed\n\n";
@family_classifications_file = @filtered_entries;
shift @family_classifications_file;
}
# Print the results in the array to output.1 and output.3 and push the names of the TAP families and Subfamilies into $family_list to create output.2
foreach my $fcf_line (@family_classifications_file) {
$fcf_line =~ /^[^;]+;([^;]+)/;
#push @family_list, "$1";
print FAMILY_CLASSIFICATIONS "$fcf_line";
}
close (FAMILY_CLASSIFICATIONS);
foreach my $fcf_line (@subfamily_classifications_file) {
$fcf_line =~ /^[^;]+;([^;]+);([^;]+)/;
if ($2 eq "-") {
push @family_list, "$1";
}
else {
push @family_list, "$2";
}
print SUBFAMILY_CLASSIFICATIONS "$fcf_line";
}
close (SUBFAMILY_CLASSIFICATIONS);
print "*** calculating the family statistics and write it in $family_statistics ***\n\n";
##################################
### 5. Create the output files ###
##################################
my $statistics_outputfile = "$family_statistics";
unless (open(FAMILY_STATISTICS, ">$statistics_outputfile")) {
print "Cannot open file \"$statistics_outputfile\" to write to!!\n\n";
exit;
}
# Count the family entries
my @output_family_statistics = ();
my @gefundene_familien = ();
my $family_counter = 1;
shift @family_list;
@family_list = sort @family_list;
my $old_family = "";
push @family_list, 'BAD FIX'; # Makes to loop go through every fam and stops at non fam(BAD FIX).
foreach my $line (@family_list) {
if ($line eq $old_family) {
$family_counter++;
}
elsif ($old_family ne "") {
push (@output_family_statistics,"$old_family;$family_counter\n");
# Add all found families to a list of found families
push (@gefundene_familien,"$old_family");
$family_counter=1;
}
$old_family = $line;
}
my %hash = ();
# Put all families from the classifictaion ruled and all found families in a hash
foreach my $element (@gefundene_familien,@liste_alle_familien) {$hash{$element}++;}
# Remove merged families
delete $hash{'GARP_ARR-B_Myb'};
delete $hash{'GARP_ARR-B_G2'};
delete $hash{'bZIP1'};
delete $hash{'bZIP2'};
delete $hash{'bZIPAUREO'};
delete $hash{'bZIPCDD'};
delete $hash{'HRT'};
delete $hash{'GIY_YIG'};
# Add all not found families from the classification rules to @output_family_statistics
# With zero as number of families found
foreach my $element (keys %hash) {
if (($hash{$element} == 1) and ( $element eq "0_no_family_found")) {
push (@output_family_statistics,"$element;$unclassified_families\n");
}
if (($hash{$element} == 1) and ($element ne "0_no_family_found")) {
push (@output_family_statistics,"$element;0\n");
}
}
# Sort @output_family_statistics caseinsensitive-alphabetically
my @sortierte_statistik = sort {lc $a cmp lc $b} @output_family_statistics;
# Print headline to @sortierte_statistik
unshift (@sortierte_statistik,"family statistics for $hmmsearch_output\n");
print FAMILY_STATISTICS @sortierte_statistik;
# Print FAMILY_STATISTICS "$old_family;$family_counter\n";
close (FAMILY_STATISTICS);
#################################################
### 6. Give out some statistical informations ###
#################################################
$entry_counter [0] = 0;
my $sum = 0;
foreach my $entry (@entry_counter) {
$sum += $entry;
}
print "$classified_families classifications were found for $sum proteins.\n";
print "This classifications are divided in:\n";
my $count = 0;
foreach my $element (@entry_counter) {
if ($count != 0) {
print "$element proteins were classified for $count";
if ($count == 1) {print " family\n";}
else {print " different families\n";}
}
$count++;
}
print "\n$unclassified_families proteins could not be classified\n\n";
print "*** The results were written in $family_classifications and $subfamily_classifications ***\n";
print "*** done ***\n\n";
exit;
sub get_file_data {
my ($filename) = @_;
use strict;
use warnings;
my @filedata = ();
unless( open(GET_FILE_DATA, $filename)) {
print STDERR "Cannot open file \"$filename\"n\n";
exit;
}
@filedata = <GET_FILE_DATA>;
close GET_FILE_DATA;
return @filedata;
}