mv scripts to scripts dir, generate results in results dir

panyq357 · Jan 11, 2024 · dd65531 · dd65531
1 parent c773d6a
commit dd65531
Show file tree

Hide file tree

Showing 7 changed files with 541 additions and 455 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__
+results
diff --git a/README.md b/README.md
@@ -1,3 +1,4 @@
+Scripts for preparing ontology long tables for enrichment analysis and GSEA.
 
 ## Requirements
 
@@ -10,34 +11,42 @@ python3 -m pip install pandas owlready2 retrying openpyxl
 - [Oryzabase annotation](https://shigen.nig.ac.jp/rice/oryzabase/download/gene)
 
 ```bash
-python3 get_ontologies_from_oryzabase.py
+python3 scripts/get_ontologies_from_oryzabase.py
 ```
 
 ## Clean JGI Sitalica annotation info
 
 - [JGI Sitalica annotation](https://data.jgi.doe.gov/refine-download/phytozome?organism=Sitalica)
 
 ```bash
-python3 clean_jgi_si_annotation.py
+python3 scripts/clean_jgi_si_annotation.py
 ```
 
 ## Enrichment Analysis using clusterProfiler
 
 ```r
-onto = readxl::read_excel("oryzabase-ontologies-2023-05-27.xlsx", sheet="RAP_GO")
+install.packages("readxl")
+install.packages("BiocManager")
+BiocManager::install("clusterProfiler")
+```
+
+```r
+onto = readxl::read_excel("results/oryzabase-ontologies.xlsx", sheet="RAP_GO")
 
-gene_list = c("Os01g0118100", "Os01g0549700", "Os02g0710800", "Os03g0108600", "Os03g0158200", "Os03g0746500")
-universe = unique(onto[["GeneID"]])
+gene = c("Os01g0118100", "Os01g0549700", "Os02g0710800", "Os03g0108600", "Os03g0158200", "Os03g0746500")
+universe = NULL
 
-enrich_result = clusterProfiler::enricher(
-    gene=gene_list,
+enrich_res = clusterProfiler::enricher(
+    gene=gene,
     universe=universe,
     TERM2GENE=onto[c("OntoID", "GeneID")],
     TERM2NAME=onto[c("OntoID", "Description")]
 )
 
+write.csv(as.data.frame(enrich_res), "enrich_res.csv")
+
 svg("demo_dotplot.svg")
-clusterProfiler::dotplot(enrich_result)
+clusterProfiler::dotplot(enrich_res)
 dev.off()
 ```
 

diff --git a/demo_dotplot.svg b/demo_dotplot.svg
diff --git a/enrich_res.csv b/enrich_res.csv
@@ -0,0 +1,39 @@
+"","ID","Description","GeneRatio","BgRatio","pvalue","p.adjust","qvalue","geneID","Count"
+"GO:0140098","GO:0140098","catalytic activity, acting on RNA","6/6","102/12264",2.85294211795174e-13,1.79010754261359e-11,8.25513252734087e-12,"Os01g0118100/Os01g0549700/Os02g0710800/Os03g0108600/Os03g0158200/Os03g0746500",6
+"GO:0004386","GO:0004386","helicase activity","6/6","105/12264",3.40972865259732e-13,1.79010754261359e-11,8.25513252734087e-12,"Os01g0118100/Os01g0549700/Os02g0710800/Os03g0108600/Os03g0158200/Os03g0746500",6
+"GO:0140657","GO:0140657","ATP-dependent activity","6/6","278/12264",1.28652420390045e-10,3.68222180984431e-09,1.69806720052971e-09,"Os01g0118100/Os01g0549700/Os02g0710800/Os03g0108600/Os03g0158200/Os03g0746500",6
+"GO:0140640","GO:0140640","catalytic activity, acting on a nucleic acid","6/6","282/12264",1.40275116565497e-10,3.68222180984431e-09,1.69806720052971e-09,"Os01g0118100/Os01g0549700/Os02g0710800/Os03g0108600/Os03g0158200/Os03g0746500",6
+"GO:0032392","GO:0032392","DNA geometric change","2/6","10/12264",8.96084869265617e-06,0.000156814852121483,7.23156210284533e-05,"Os01g0549700/Os03g0746500",2
+"GO:0032508","GO:0032508","DNA duplex unwinding","2/6","10/12264",8.96084869265617e-06,0.000156814852121483,7.23156210284533e-05,"Os01g0549700/Os03g0746500",2
+"GO:0003723","GO:0003723","RNA binding","4/6","380/12264",1.29558278470181e-05,0.000194337417705272,8.9619260295163e-05,"Os01g0118100/Os01g0549700/Os03g0108600/Os03g0158200",4
+"GO:0071103","GO:0071103","DNA conformation change","2/6","14/12264",1.81050641762374e-05,0.000237628967313116,0.000109583283171963,"Os01g0549700/Os03g0746500",2
+"GO:0051028","GO:0051028","mRNA transport","2/6","15/12264",2.08859136807111e-05,0.00024366899294163,0.000112368658399148,"Os01g0549700/Os03g0158200",2
+"GO:0050657","GO:0050657","nucleic acid transport","2/6","18/12264",3.04138991569912e-05,0.000266121617623673,0.00012272275098435,"Os01g0549700/Os03g0158200",2
+"GO:0050658","GO:0050658","RNA transport","2/6","18/12264",3.04138991569912e-05,0.000266121617623673,0.00012272275098435,"Os01g0549700/Os03g0158200",2
+"GO:0051236","GO:0051236","establishment of RNA localization","2/6","18/12264",3.04138991569912e-05,0.000266121617623673,0.00012272275098435,"Os01g0549700/Os03g0158200",2
+"GO:0015931","GO:0015931","nucleobase-containing compound transport","2/6","22/12264",4.58790687376069e-05,0.000370561709034517,0.000170885600156268,"Os01g0549700/Os03g0158200",2
+"GO:0051276","GO:0051276","chromosome organization","2/6","76/12264",0.000559420900666902,0.00419565675500176,0.0019348392053141,"Os01g0549700/Os03g0746500",2
+"GO:0006996","GO:0006996","organelle organization","3/6","465/12264",0.000994289150616882,0.00696002405431818,0.00320963515286853,"Os01g0549700/Os03g0108600/Os03g0746500",3
+"GO:0006406","GO:0006406","mRNA export from nucleus","1/6","10/12264",0.00488339925547487,0.0313973576819446,0.0144789819886662,"Os03g0158200",1
+"GO:0006396","GO:0006396","RNA processing","2/6","232/12264",0.00508338171993389,0.0313973576819446,0.0144789819886662,"Os01g0549700/Os03g0108600",2
+"GO:0071705","GO:0071705","nitrogen compound transport","2/6","250/12264",0.00588136527864102,0.0333088228559844,0.0153604596629101,"Os01g0549700/Os03g0158200",2
+"GO:0005635","GO:0005635","nuclear envelope","1/6","13/12264",0.00634453768685417,0.0333088228559844,0.0153604596629101,"Os03g0158200",1
+"GO:0006405","GO:0006405","RNA export from nucleus","1/6","13/12264",0.00634453768685417,0.0333088228559844,0.0153604596629101,"Os03g0158200",1
+"GO:0010286","GO:0010286","heat acclimation","1/6","15/12264",0.00731763638131266,0.0365881819065633,0.0168727455408713,"Os03g0158200",1
+"GO:0051168","GO:0051168","nuclear export","1/6","17/12264",0.00828994080678425,0.037845381944015,0.017452506961651,"Os03g0158200",1
+"GO:1901701","GO:1901701","cellular response to oxygen-containing compound","1/6","17/12264",0.00828994080678425,0.037845381944015,0.017452506961651,"Os03g0108600",1
+"GO:0003678","GO:0003678","DNA helicase activity","1/6","21/12264",0.0102321689254229,0.0429751094867761,0.0198180956029243,"Os03g0746500",1
+"GO:0006401","GO:0006401","RNA catabolic process","1/6","21/12264",0.0102321689254229,0.0429751094867761,0.0198180956029243,"Os03g0746500",1
+"GO:0016226","GO:0016226","iron-sulfur cluster assembly","1/6","24/12264",0.0116867589144992,0.0454485068897192,0.0209587099441312,"Os03g0108600",1
+"GO:0031163","GO:0031163","metallo-sulfur cluster assembly","1/6","24/12264",0.0116867589144992,0.0454485068897192,0.0209587099441312,"Os03g0108600",1
+"GO:0046474","GO:0046474","glycerophospholipid biosynthetic process","1/6","29/12264",0.0141071167418454,0.0481907213909368,0.0222232900649934,"Os03g0108600",1
+"GO:0009644","GO:0009644","response to high light intensity","1/6","30/12264",0.0145905950662876,0.0481907213909368,0.0222232900649934,"Os03g0158200",1
+"GO:0019288","GO:0019288","isopentenyl diphosphate biosynthetic process, methylerythritol 4-phosphate pathway","1/6","30/12264",0.0145905950662876,0.0481907213909368,0.0222232900649934,"Os03g0108600",1
+"GO:0071702","GO:0071702","organic substance transport","2/6","403/12264",0.0147999329235967,0.0481907213909368,0.0222232900649934,"Os01g0549700/Os03g0158200",2
+"GO:0006913","GO:0006913","nucleocytoplasmic transport","1/6","33/12264",0.016039844720983,0.0481907213909368,0.0222232900649934,"Os03g0158200",1
+"GO:0019682","GO:0019682","glyceraldehyde-3-phosphate metabolic process","1/6","33/12264",0.016039844720983,0.0481907213909368,0.0222232900649934,"Os03g0108600",1
+"GO:0051169","GO:0051169","nuclear transport","1/6","33/12264",0.016039844720983,0.0481907213909368,0.0222232900649934,"Os03g0158200",1
+"GO:0009240","GO:0009240","isopentenyl diphosphate biosynthetic process","1/6","34/12264",0.0165225330483212,0.0481907213909368,0.0222232900649934,"Os03g0108600",1
+"GO:0046490","GO:0046490","isopentenyl diphosphate metabolic process","1/6","34/12264",0.0165225330483212,0.0481907213909368,0.0222232900649934,"Os03g0108600",1
+"GO:0007275","GO:0007275","multicellular organism development","2/6","443/12264",0.0177305740627191,0.0496523288805762,0.0228973145714938,"Os03g0108600/Os03g0746500",2
+"GO:0045017","GO:0045017","glycerolipid biosynthetic process","1/6","37/12264",0.0179694142615419,0.0496523288805762,0.0228973145714938,"Os03g0108600",1
diff --git a/clean_jgi_si_annotation.py → scripts/clean_jgi_si_annotation.py b/clean_jgi_si_annotation.py → scripts/clean_jgi_si_annotation.py
@@ -1,19 +1,21 @@
 #!/usr/bin/env python3
 # -*- coding: utf-8 -*-
-# Date: 2023-07-10
-# Author: panyq
-# Description: Make GO, PO, TO annotation long table from oryzabase downloadable file.
+# Date: 2024-01-11
+# Author: panyq357
+# Description: Make GO annotation long table from JGI annotation info file.
+
+from pathlib import Path
 
 import pandas as pd
 
 from onto_wrapper import Onto
 
 config = {
-    "jgi_si_annotation": "~/Downloads/Sitalica_312_v2.2.annotation_info.txt",
+    "jgi_si_annotation": "/mnt/d/PublicData/JGI/Setaria_italica_v2.2/rawdata/Sitalica_312_v2.2.annotation_info.txt",
     "go_owl": "http://purl.obolibrary.org/obo/go.owl",
     "gene_id_regex": r"Seita.[1-9]G\d{6}",
     "go_id_regex": r"GO:\d{7}",
-    "out_path": "jgi-si-go-v2.2.xlsx"
+    "out_path": "results/jgi-si-go-v2.2.xlsx"
 }
 
 def main():
@@ -32,8 +34,14 @@ def main():
 
     df["Category"] = df["OntoID"].map(go_onto.get_go_category)
 
-    print(f"Writing to {config['out_path']} ...")
-    df.to_excel(config['out_path'], index=False)
+    out_path = Path(config['out_path'])
+
+    print(f"Writing to {str(out_path)} ...")
+
+    if not out_path.parent.exists():
+        out_path.parent.mkdir(parents=True)
+
+    df.to_excel(out_path, index=False)
     print("Done")
 
 

diff --git a/get_ontologies_from_oryzabase.py → scripts/get_ontologies_from_oryzabase.py b/get_ontologies_from_oryzabase.py → scripts/get_ontologies_from_oryzabase.py
@@ -5,6 +5,7 @@
 # Description: Make GO, PO, TO annotation long table from oryzabase downloadable file.
 
 import datetime
+from pathlib import Path
 import re
 
 import pandas as pd
@@ -34,9 +35,9 @@
         "RAP_TO": {"gene": "RAP ID", "onto": "Trait Ontology"},
         "MSU_GO": {"gene": "MSU ID", "onto": "Gene Ontology"},
         "MSU_PO": {"gene": "MSU ID", "onto": "Plant Ontology"},
-        "MSU_TO": {"gene": "MSU ID", "onto": "Trait Ontology"},
+        "MSU_TO": {"gene": "MSU ID", "onto": "Trait Ontology"}
     },
-    "out_path": f"oryzabase-ontologies-{str(datetime.date.today())}.xlsx"
+    "out_path": f"results/oryzabase-ontologies.xlsx"
 }
 
 def main():
@@ -73,8 +74,13 @@ def main():
         df_dict[table_name] = df
         print("Done")
 
-    print(f"Writing to {config['out_path']} ...")
-    with pd.ExcelWriter(config["out_path"]) as writer:
+    out_path = Path(config['out_path'])
+    print(f"Writing to {str(out_path)} ...")
+
+    if not out_path.parent.exists():
+        out_path.parent.mkdir(parents=True)
+
+    with pd.ExcelWriter(out_path) as writer:
         for df_name, df in df_dict.items():
             df.to_excel(writer, df_name, index=False)
     print("Done")

diff --git a/onto_wrapper.py → scripts/onto_wrapper.py b/onto_wrapper.py → scripts/onto_wrapper.py
@@ -162,6 +162,7 @@ def _id_ex(str_containing_id):
     def get_go_category(self, go_id):
         '''
         Determine which category a GO ID belongs to.
+        Only valid when the loaded ontology is the Gene Ontology ("go.owl").
         '''
 
         if self.has_ancestor(go_id, "GO:0008150"):