/* * * * * * * * * * * * * * * * * * * * * * * * */ /* Analysis differiantally expressed transcripts */ /* between solid tumors and normal tissue */ /* samples in Anduril casestudy. */ /* */ /* Erkka Valo, Erkka.Valo@Helsinki.FI */ /* Marko Laakso, Marko.Laakso@Helsinki.FI */ /* Kristian Ovaska, Kristian.Ovaska@Helsinki.FI */ /* Riku Louhimo, Riku.Louhimo@Helsinki.FI */ /* */ /* * * * * * * * * * * * * * * * * * * * * * * * */ function MedianExonExpressionAnalysis(LogMatrix tExprMatrix, float fcThreshold = 2.0, float pValueThreshold = 0.05) -> (Latex report, CSV det, Matrix exprIndicator, CSV survival, Latex kaplanMeier, CSV kaplanMeierStatistics, CSV allProbes) { /** List of samples without clinical annotation. */ samplesWithoutClinicalAnnotation = INPUT(path="csv/samplesWithoutClinicalAnnotation.csv") /** SQL query to compile the differentially expressed transcript table. */ deeTableQuery = INPUT(path="sql/deeTableQuery.sql") /** Intersection transformation of differentially expressed transcripts selected by fold change and differentialyy expressed transcripts selected by t-test. */ intersectionTransformation = INPUT(path="csv/FoldChangeAndTTest.csv") /** R Script for filtering out samples without clinical annotation. */ filterSamplesRScript = INPUT(path="r/filterSamples.R") /** Patient metadata for survival analysis. */ metadata = INPUT(path=patientMetadataFileName) /* Whether to filter rows from the expression matrix where variation is below threshold. */ varFilterRows = false /* Threshold for filtering low variation rows. */ varFilterThreshold = 0.05 // Filter rows with low variance if (varFilterRows) { tExprFiltered = VariationFilter(matrix=tExprMatrix, threshold=varFilterThreshold) } // Filter samples without clinical annotation tExpr = REvaluate(table1 = tExprMatrix, table2 = samplesWithoutClinicalAnnotation, script = filterSamplesRScript) // Generate sample groups of tumor mean and normal mean. sampleGroups = SampleGroupCreator(tExpr.table, pattern1="TCGA_[0-9]{2}_[0-9]{4}_01", definition1="tumor_solid,mean,Solid tumor samples", pattern2="TCGA_[0-9]{2}_[0-9]{4}_11", definition2="normal_tissue,mean,Normal tissue samples", pattern3="tumor_solid,normal_tissue", definition3="tumor_vs_normal,ratio,fold change of tumor median vs. normal median", patternTypes="re,re,verbatim") // Calculate group means groupMeans = SampleCombiner(force expr = tExpr.table, groups = sampleGroups.groups, geometricMean = true, groupIDs = "tumor_solid,normal_tissue") // Calculate logratio for tumor mean vs normal mean logratio = SampleCombiner(expr = groupMeans.expr, force groups = sampleGroups.groups, groupIDs = "tumor_vs_normal") // Calculate differentially expressed transcripts based on fold change. detFoldChange = FoldChange(logratio = logratio, threshold = fcThreshold) // Calculate differentially expressed transcripts based on t-test. detTTest = StatisticalTest(force matrix = tExpr.table, force groups = sampleGroups.groups, correction = "fdr", targetColumns = "tumor_solid", referenceColumns = "normal_tissue", outputSet = "tumor_vs_normal_tTest", threshold = pValueThreshold) // Take differentially expressed transcripts based on fold change and t-test. det = SetTransformer(transformation = intersectionTransformation, set1 = detFoldChange.deg, set2 = detTTest.idlist, includeOriginal = true) detIDList = CSV2IDList(det, regexp1 = "ID=over|under", columnIn = "Members", columnOut = "GeneID", isList = true) // Fetch annotations for differentially expressed transcripts detAnnot = KorvasieniAnnotator(sourceKeys = detIDList, connection = ensembl, inputDB = ".GeneId", inputType = "Gene", targetDB = ".GeneId,.GeneName,.GeneDesc,GO") // Compile a table of differentially expressed transcripts detTable = TableQuery(table1 = detIDList, table2 = detTTest.pvalues, table3 = logratio, table4 = detAnnot.bioAnnotation, query = deeTableQuery) // GO Clustering of the DETs goClusters = GOClustering(goAnnotations = detAnnot, force expr = tExpr.table, columnMargin = 0, dropUnknown = true, grayScale = false, asBitmap = true, heatmapScale = "row") // Report of differentially expressed transcripts. detReport = DEGReport(deg = det, force geneAnnotation = logratio, geneColumn = "GeneID", setPattern = "over|under", sectionTitle = "Summary of differentially expressed genes: fc > "+ fcThreshold+" and corrected p-value < "+pValueThreshold) detVennDiagram = VennDiagram(det, sets1 = "tumor_vs_normal_tTest,tumor_vs_normal_fcOver,tumor_vs_normal_fcUnder", names1 = "T-test,over,under", cexSetName = 0.4, cexSetSize = 0.4) // Quality control // Cumulative distribution for the samples cumDistReport = Plot2D(y = tExpr.table, caption = "Distribution of transcript expression values (log2) for the samples.", sectionTitle = "Ordered transcript expression values", legendPosition = "off", xLabel = "transcripts", yLabel = "intensity", title = "", sort = true, plotType = "l", imageType = "single", height = 11) // MDS plot of samples mdsReport = MDSPlot(force expr = tExpr.table, force groups = sampleGroups.groups, plotNames = true, sectionTitle = "Multidimensional scaling of the samples", cex = 0.2, legendPosition = "topright") // Compile quality control report qcReport = LatexCombiner(cumDistReport.plot, mdsReport.report) /** Survival analysis based on transcript expression. */ km = TranscriptSurvival(sampleGroups.groups, metadata, force tExpr.table, detIDList, detAnnot.bioAnnotation, fcThreshold = fcThreshold, annotCol = ".GeneName,.GeneDesc", title = "Survival analysis for gene expression") // SPIA survGenes = CSVTransformer(km.statistics, transform1="csv1[csv1$pValue < 0.01, 'group']") survGenesEntrez = KorvasieniAnnotator(survGenes, connection=ensembl, inputDB=".GeneId", targetDB="_EntrezGene", maxHits=1) survLogratio = CSVFilter(logratio, survGenes) survLogratioEntrez = IDConvert(survLogratio, survGenesEntrez, conversionColumn="EntrezGene", originalWhenMissing=false, unique=true) survLogratioEntrezFilt = CSVFilter(survLogratioEntrez, nonMissing=2) allEntrez = KorvasieniAnnotator(logratio, connection=ensembl, inputDB=".GeneId", targetDB="_EntrezGene", maxHits=1) reference = CSV2IDList(allEntrez, columnIn="EntrezGene", @execute="once") spia = SPIA(deg=survLogratioEntrezFilt, reference=reference) exonSPIArefs = INPUT(path="csv/exonSPIArefs.csv") spiaReport = CSV2Latex(spia.pathways, refs = exonSPIArefs, columns = "Name,ID,pSize,NDE,tA,pNDE,pPERT,pG,pGFdr,pGFWER,Status", ruler = "{}", numberFormat = "tA=#0.0000,pNDE=#0.0000,pG=#0.0000,pGFdr=#0.0000,pGFWER=#0.0000", section = "Signalling pathway impact analysis for differentially expressed genes "+ "evaluated from median exon expression with survival p $<$ 0.01 ", sectionType = "section", pageBreak = true, caption = "SPIA analysis ~\\cite{Tarca2008}. Name is the KEGG name for the pathway, and "+ "ID is the KEGG internal identication for the pathway. "+ "pSize is total number of genes in the pathway according to KEGG, where as "+ "NDE is the number of diffentially expressed genes on the pathway. pNDE is the "+ "p-value to observe NDE number of genes on the pathway by chance. tA is a "+ "perturbation factor defined by SPIA algorithm and pPERT is the probability of "+ "observing a more intense perturbation than tA by chance. pG is the combination "+ "of pPERT and pNDE. pGFdr is the false discovery rate and pGFWER the "+ "Bonferoni corrected p-value.") // PDF report report = LatexCombiner(detReport.summary, detVennDiagram.report, qcReport.document, goClusters.report, km.report, spiaReport.report) /** Transcript result data processing. */ transcriptStatisticsQuery = INPUT(path="sql/MedianExonStatistics.sql") allProbes = TableQuery(logratio.expr, detTTest.pvalues, query = transcriptStatisticsQuery) return record(report = report.document, det = detTable.table, exprIndicator = km.sampleExpressionIndicator, survival = km.survivalTable, kaplanMeier = km.report, kaplanMeierStatistics = km.statistics, allProbes = allProbes.table) }