// Andril Case Study - EXPRESSION ANALYSIS FOR GENES // author: Marko Laakso function GeneExpressionAnalysis(LogMatrix normalizedData, float fcLimit = 2.0, float pLimit = 0.05, string arrayType = "affy_hg_u133a", boolean joinProbes = true) -> (Latex report, CSV degs, CSV allProbes, Properties biomart) { /** List of samples without clinical annotation. */ samplesWithoutClinicalAnnotation = INPUT(path="csv/samplesWithoutClinicalAnnotation.csv") /** R Script for filtering out samples without clinical annotation. */ filterSamplesRScript = INPUT(path="r/filterSamples.R") // Filter samples without clinical annotation sampleFilter = REvaluate(table1 = normalizedData, table2 = samplesWithoutClinicalAnnotation, script = filterSamplesRScript) probeAnnot = BiomartAnnotator(filter = normalizedData, filterTypes = arrayType, attributes = "ensembl_gene_id", batchSize = 1000) /** Skip those probes that do not match genes or a mapped to multiple genes. */ uniqProbesSQL = INPUT(path="sql/UniqProbes.sql") probeAnnotExp = ExpandCollapse(probeAnnot.annotations, listCols="ensembl_gene_id") probeSelect = TableQuery(query=uniqProbesSQL, table1=probeAnnotExp) probeFilter = CSVFilter(force sampleFilter.table, probeSelect, matchColumn="probe") /* Threshold for filtering low variation rows. */ varFilterThreshold = 0.1 if (joinProbes) { probeJoin = IDConvert(probeFilter, probeSelect, conversionColumn=".GeneId", unique =true) varFilter = VariationFilter(matrix=probeJoin.csv, threshold=varFilterThreshold) } else { varFilter = VariationFilter(matrix=probeFilter, threshold=varFilterThreshold) } sampleGroups = SampleGroupCreator(varFilter, pattern1 = "TCGA_[0-9]{2}_[0-9]{4}_01", definition1 = "tumor_solid,mean,Solid tumor samples", pattern2 = "TCGA_[0-9]{2}_[0-9]{4}_11", definition2 = "normal_tissue,mean,Normal tissue samples", pattern3 = "tumor_solid,normal_tissue", definition3 = "tumor_vs_normal,ratio,fold change of tumor median vs. normal median", patternTypes = "re,re,verbatim") groupMeans = SampleCombiner(expr = varFilter, groups = sampleGroups.groups, groupIDs = "tumor_solid,normal_tissue") logratio = SampleCombiner(expr = groupMeans.expr, groups = sampleGroups.groups, groupIDs = "tumor_vs_normal") // Calculate differentially expressed genes based on fold change. foldChange = FoldChange(logratio = logratio, threshold = fcLimit) // Calculate differentially expressed genes based on t-test. tTest = StatisticalTest(matrix = varFilter, groups = sampleGroups.groups, correction = "fdr", targetColumns = "tumor_solid", referenceColumns = "normal_tissue", outputSet = "tumor_vs_normal_tTest", threshold = pLimit) /** Selection of differentially expressed genes by their fold change and t-test calls. */ degTransformation = INPUT(path="csv/FoldChangeAndTTest.csv") // Take differentially expressed transcripts based on fold change and t-test. degSet = SetTransformer(transformation = degTransformation, set1 = foldChange.deg, set2 = tTest.idlist) degProbes = CSV2IDList(degSet, regexp1="ID=over|under", columnIn="Members,Members", columnOut=".GeneId", isList=true) degAnnotK = KorvasieniAnnotator(sourceKeys = degProbes, // degAnnotPre connection = ensembl, inputDB = ".GeneId", targetDB = ".GeneName,.GeneDesc,GO,_Uniprot/SWISSPROT", echoColumns = ".GeneId", inputType = "Gene") /** Switch the order of annotation columns. */ degAnnotCols = INPUT(path="r/DEGAnnotColumns.r") degAnnot = REvaluate(script=degAnnotCols, table1=degAnnotK) /** This query combines DEGs to their fold changes and p-values. */ degStatSQL = INPUT(path="sql/DEGStatistics.sql") degStat = TableQuery(table1 = degAnnot.table, table2 = logratio, table3 = tTest.pvalues, query = degStatSQL) degReport = DEGReport(deg = degSet, force geneAnnotation = degAnnot.table, expr = logratio, geneColumn = ".GeneName", sectionTitle = "Summary of differentially expressed genes", sectionType = "subsection", columns = 8) heatmap = GOClustering(force goAnnotations = degAnnot.table, expr = varFilter, columnMargin = 0, dropUnknown = true, grayScale = true, asBitmap = true, heatmapScale = "row") survivalQuery = INPUT(path="sql/exonSurvival.sql") metadata = INPUT(path=patientMetadataFileName) survival = TranscriptSurvival(sampleGroups.groups, metadata, varFilter, force detAnnotation = degAnnot.table, fcThreshold = fcLimit, title = "Survival analysis for DEGs", annotCol = ".GeneName,.GeneDesc,Uniprot/SWISSPROT") report = LatexCombiner(degReport.summary, degReport.genelist, heatmap.report, survival.report, sectionTitle = "Gene expression analysis") /* Compute result table for all probes. */ allMeans = SampleCombiner(expr = probeFilter, groups = sampleGroups.groups, groupIDs = "tumor_solid,normal_tissue") allLogratio = SampleCombiner(expr = allMeans.expr, groups = sampleGroups.groups, groupIDs = "tumor_vs_normal") geneStatisticsQuery = INPUT(path="sql/GeneStatistics.sql") allProbesDup = TableQuery(probeSelect.table, allLogratio.expr, tTest.pvalues, query = geneStatisticsQuery, numIndices = 2) allProbes = IDConvert(allProbesDup.table, allProbesDup.table, sourceColumn="GeneID", conversionColumn="GeneID", unique=true) return record(report = report, degs = degStat, allProbes = allProbes.csv, biomart = probeAnnot.databases) }