| 1 | # Copyright (c) 2010 - 2011 European Molecular Biology Laboratory |
|---|
| 2 | # |
|---|
| 3 | # Licensed under the Apache License, Version 2.0 (the "License"); |
|---|
| 4 | # you may not use this file except in compliance with the License. |
|---|
| 5 | # You may obtain a copy of the License at |
|---|
| 6 | # |
|---|
| 7 | # http://www.apache.org/licenses/LICENSE-2.0 |
|---|
| 8 | # |
|---|
| 9 | # Unless required by applicable law or agreed to in writing, software |
|---|
| 10 | # distributed under the License is distributed on an "AS IS" BASIS, |
|---|
| 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
|---|
| 12 | # See the License for the specific language governing permissions and |
|---|
| 13 | # limitations under the License. |
|---|
| 14 | |
|---|
| 15 | ########################################################## |
|---|
| 16 | # Gene enrichment test by using ontoCAT R package |
|---|
| 17 | # Please use the full version of ontoCAT R package: https://sourceforge.net/projects/ontocat/files/ontoCAT/ontoCAT_R/ontoCAT_1.2.1.tar.gz |
|---|
| 18 | ########################################################## |
|---|
| 19 | |
|---|
| 20 | #Java Heap size needed to reason over GO ontology (more than 20 MB in size) is 512MB. |
|---|
| 21 | #Here are the instructions how to increase Java Heap Size in R: |
|---|
| 22 | |
|---|
| 23 | library(rJava) |
|---|
| 24 | options(java.parameters="-Xmx512") |
|---|
| 25 | .jinit() |
|---|
| 26 | |
|---|
| 27 | #To check the result: |
|---|
| 28 | .jcall(.jnew("java/lang/Runtime"), "J", "maxMemory") |
|---|
| 29 | #Now it is possible to work with large ontologies like GO |
|---|
| 30 | |
|---|
| 31 | library(RCurl) |
|---|
| 32 | library(rjson) |
|---|
| 33 | |
|---|
| 34 | # Define a function to retrieve some genes from Gene Expression Atlas (http://www.ebi.ac.uk/gxa/) |
|---|
| 35 | options(GXAAPIURL="http://www.ebi.ac.uk/gxa/api?") |
|---|
| 36 | queryGeneUp <- function(query) fromJSON(getURL(paste(getOption("GXAAPIURL"), "upIn=", query, sep = ""))) |
|---|
| 37 | |
|---|
| 38 | # Create a list of over-expressed genes in pancreas vs other tissues |
|---|
| 39 | g <- queryGeneUp("pancreas") |
|---|
| 40 | |
|---|
| 41 | # g$results is a list of 100 genes, each accessible by g$results[[i]]$gene |
|---|
| 42 | names(g$results[[10]]$gene) |
|---|
| 43 | |
|---|
| 44 | # Get GO ontology |
|---|
| 45 | library(ontoCAT) |
|---|
| 46 | |
|---|
| 47 | # Obtain GO ontology |
|---|
| 48 | go <- getOntology("http://www.geneontology.org/ontology/obo_format_1_2/gene_ontology_ext.obo") |
|---|
| 49 | |
|---|
| 50 | # Obtain the list of all GOIDs using biomaRt |
|---|
| 51 | library(biomaRt) |
|---|
| 52 | mart <- useMart("ensembl") |
|---|
| 53 | mart<-useDataset("hsapiens_gene_ensembl",mart) |
|---|
| 54 | genes <- getBM(attributes=c("ensembl_gene_id","go_biological_process_id"), mart=mart) |
|---|
| 55 | all.GOIDs <- unique(genes[,2]) |
|---|
| 56 | all.GOIDs <- all.GOIDs[2:length(all.GOIDs)] |
|---|
| 57 | |
|---|
| 58 | # Create a list of over-expressed genes |
|---|
| 59 | genes.of.interest <- c(sapply(1:100,function(x) g$results[[x]]$gene$ensemblGeneId),recursive = TRUE) |
|---|
| 60 | selection <- genes[which(genes[,1] %in% genes.of.interest),] |
|---|
| 61 | |
|---|
| 62 | # Create a list of genes for each GO ID |
|---|
| 63 | genes.by.GOID <- sapply(1:length(all.GOIDs),function(x) genes[which(genes[,2] == all.GOIDs[x]),][1]$ensembl_gene_id) |
|---|
| 64 | names(genes.by.GOID)=all.GOIDs |
|---|
| 65 | |
|---|
| 66 | # Create a list containing the number of genes in each category |
|---|
| 67 | size.by.GOID<-lapply(genes.by.GOID,length) |
|---|
| 68 | |
|---|
| 69 | # Construct a list containing the overlap for each GO category with the genes of interest |
|---|
| 70 | #by using ontoCAT library to get all children (subclasses) of particular GO term |
|---|
| 71 | overlaps = lapply(names(genes.by.GOID), function(x) length(which(selection[,2] %in% |
|---|
| 72 | gsub("GO_*", "GO:", c(lapply(getTermAndAllChildrenById(go,toString(gsub("GO:*", "GO_",x))),function(t) {getAccession(t)}),recursive = TRUE)) |
|---|
| 73 | ))) |
|---|
| 74 | |
|---|
| 75 | # Create a list of hypergeometric p-values |
|---|
| 76 | pvals = as.array(sapply(1:length(genes.by.GOID), function(x) |
|---|
| 77 | phyper(overlaps[[x]]-1, |
|---|
| 78 | size.by.GOID[[x]], |
|---|
| 79 | length(all.GOIDs) - size.by.GOID[[x]], |
|---|
| 80 | length(unique(selection[,1])), |
|---|
| 81 | lower.tail=FALSE))) |
|---|
| 82 | names(pvals) = names(genes.by.GOID) |
|---|
| 83 | |
|---|
| 84 | # Get the most enriched categories: |
|---|
| 85 | pvals[order(pvals)][1:10] |
|---|
| 86 | |
|---|
| 87 | |
|---|
| 88 | |
|---|
| 89 | |
|---|