/* * * * * * * * * * * * * * * * * * * * * * * * */ /* Analysis differiantally expressed transcripts */ /* between solid tumors and normal tissue */ /* samples in Anduril casestudy. */ /* */ /* Erkka Valo, Erkka.Valo@Helsinki.FI */ /* Marko Laakso, Marko.Laakso@Helsinki.FI */ /* Kristian Ovaska, Kristian.Ovaska@Helsinki.FI */ /* */ /* * * * * * * * * * * * * * * * * * * * * * * * */ function TranscriptExpressionAnalysis(LogMatrix tExprMatrix, CSV tAnnot, float fcThreshold = 2.0, float pValueThreshold = 0.05) -> (Latex report, CSV det, Matrix exprIndicator, CSV survival, Latex kaplanMeier, CSV kaplanMeierStatistics, CSV allProbes) { /** List of samples without clinical annotation. */ samplesWithoutClinicalAnnotation = INPUT(path="csv/samplesWithoutClinicalAnnotation.csv") /** SQL query to compile the differentially expressed transcript table. */ detTableQuery = INPUT(path="sql/detTableQuery.sql") /** Intersection transformation of differentially expressed transcripts selected by fold change and differentialyy expressed transcripts selected by t-test. */ intersectionTransformation = INPUT(path="csv/FoldChangeAndTTest.csv") /** R Script for filtering out samples without clinical annotation. */ filterSamplesRScript = INPUT(path="r/filterSamples.R") // survivalQuery = INPUT(path="sql/exonSurvival.sql") /** Patient metadata for survival analysis. */ metadata = INPUT(path=patientMetadataFileName) /** Transcript expression annotations. */ transcriptAnnotation = INPUT(path=transcriptAnnotationFileName) /* Whether to filter rows from the expression matrix where variation is below threshold. */ varFilterRows = false /* Threshold for filtering low variation rows. */ varFilterThreshold = 0.05 // Filter rows with low variance if (varFilterRows) { tExprFiltered = VariationFilter(matrix=tExprMatrix, threshold=varFilterThreshold) } // Create expression matrix with transcripts as row names tExprTemp = IDConvert(csv = tExprMatrix, conversionTable = tAnnot, conversionColumn = "Transcript", keyColumn = "Index", sourceColumn = "Index", targetColumn = ".TranscriptId") // Filter samples without clinical annotation tExpr = REvaluate(table1 = tExprTemp, table2 = samplesWithoutClinicalAnnotation, script = filterSamplesRScript) // Generate sample groups of tumor mean and normal mean. sampleGroups = SampleGroupCreator(tExpr.table, pattern1 ="TCGA_[0-9]{2}_[0-9]{4}_01", definition1 ="tumor_solid,mean,Solid tumor samples", pattern2 ="TCGA_[0-9]{2}_[0-9]{4}_11", definition2 ="normal_tissue,mean,Normal tissue samples", pattern3 ="tumor_solid,normal_tissue", definition3 ="tumor_vs_normal,ratio,fold change of tumor median vs. normal median", patternTypes="re,re,verbatim") // Calculate group means groupMeans = SampleCombiner(force expr = tExpr.table, groups = sampleGroups.groups, geometricMean = true, groupIDs = "tumor_solid,normal_tissue") // Calculate logratio for tumor mean vs normal mean logratio = SampleCombiner(expr = groupMeans.expr, force groups = sampleGroups.groups, groupIDs = "tumor_vs_normal") // Calculate differentially expressed transcripts based on fold change. detFoldChange = FoldChange(logratio = logratio, threshold = fcThreshold) // Calculate differentially expressed transcripts based on t-test. detTTest = StatisticalTest(force matrix = tExpr.table, force groups = sampleGroups.groups, correction = "fdr", targetColumns = "tumor_solid", referenceColumns = "normal_tissue", outputSet = "tumor_vs_normal_tTest", threshold = pValueThreshold) // Take differentially expressed transcripts based on fold change and t-test. det = SetTransformer(transformation = intersectionTransformation, set1 = detFoldChange.deg, set2 = detTTest.idlist, includeOriginal = true) detIDList = CSV2IDList(det, regexp1 = "ID=over|under", columnIn = "Members", columnOut = ".TranscriptId", isList = true) // Fetch annotations for differentially expressed transcripts detAnnot = KorvasieniAnnotator(sourceKeys = detIDList, connection = ensembl, inputDB = ".TranscriptId", inputType = "Transcript", targetDB = ".GeneId,.GeneName,.GeneDesc,GO") // Compile a table of differentially expressed transcripts detTable = TableQuery(table1 = detIDList, table2 = detTTest.pvalues, table3 = logratio, table4 = detAnnot.bioAnnotation, query = detTableQuery) // GO Clustering of the DETs goClusters = GOClustering(goAnnotations = detAnnot, force expr = tExpr.table, columnMargin = 0, dropUnknown = true, grayScale = false, asBitmap = true, heatmapScale = "row") // Report of differentially expressed transcripts. detReport = DEGReport(deg = det, force geneAnnotation = logratio, geneColumn = "Transcript", setPattern = "over|under", sectionTitle = "Summary of differentially expressed transcripts: fc > "+ fcThreshold+" and corrected p-value < "+pValueThreshold) detVennDiagram = VennDiagram(det, sets1 = "tumor_vs_normal_tTest,tumor_vs_normal_fcOver,tumor_vs_normal_fcUnder", names1 = "T-test,over,under", cexSetName = 0.4, cexSetSize = 0.4) // Quality control // Cumulative distribution for the samples cumDistReport = Plot2D(y = tExpr.table, caption = "Distribution of transcript expression values (log2) for the samples.", sectionTitle = "Ordered transcript expression values", legendPosition = "off", xLabel = "transcripts", yLabel = "intensity", title = "", sort = true, plotType = "l", imageType = "single", height = 11) // MDS plot of samples mdsReport = MDSPlot(force expr = tExpr.table, force groups = sampleGroups.groups, plotNames = true, sectionTitle = "Multidimensional scaling of the samples", cex = 0.2, legendPosition = "topright") // Compile quality control report qcReport = LatexCombiner(cumDistReport.plot, mdsReport.report) /** Survival analysis based on transcript expression. */ km = TranscriptSurvival(sampleGroups.groups, metadata, force tExpr.table, detIDList, detAnnot.bioAnnotation, fcThreshold = fcThreshold, annotCol = ".GeneId,.GeneName,.GeneDesc", title = "Survival analysis for transcript expression") // SPIA survGenes = CSVTransformer(km.statistics, transform1="csv1[csv1$pValue < 0.01, 'group']") survGenesEntrez = KorvasieniAnnotator(survGenes, connection=ensembl, inputDB=".TranscriptId", targetDB="_EntrezGene", maxHits=1) survLogratio = CSVFilter(logratio, survGenes) survLogratioEntrez = IDConvert(survLogratio, survGenesEntrez, conversionColumn="EntrezGene", originalWhenMissing=false, unique=true) survLogratioEntrezFilt = CSVFilter(survLogratioEntrez, nonMissing=2) allEntrez = KorvasieniAnnotator(logratio, connection=ensembl, inputDB=".TranscriptId", targetDB="_EntrezGene", maxHits=1) reference = CSV2IDList(allEntrez, columnIn="EntrezGene", @execute="once") spia = SPIA(deg=survLogratioEntrezFilt, reference=reference) exonSPIArefs = INPUT(path="csv/exonSPIArefs.csv") spiaReport = CSV2Latex(spia.pathways, refs = exonSPIArefs, columns = "Name,ID,pSize,NDE,tA,pNDE,pPERT,pG,pGFdr,pGFWER,Status", ruler = "{}", numberFormat = "tA=#0.0000,pNDE=#0.0000,pG=#0.0000,pGFdr=#0.0000,pGFWER=#0.0000", section = "Signalling pathway impact analysis for differentially expressed transcripts "+ "with survival p $<$ 0.01 ", sectionType = "section", pageBreak = true, caption = "SPIA analysis ~\\cite{Tarca2008}. Name is the KEGG name for the pathway, and "+ "ID is the KEGG internal identication for the pathway. "+ "pSize is total number of genes in the pathway according to KEGG, where as "+ "NDE is the number of diffentially expressed genes on the pathway. pNDE is the "+ "p-value to observe NDE number of genes on the pathway by chance. tA is a "+ "perturbation factor defined by SPIA algorithm and pPERT is the probability of "+ "observing a more intense perturbation than tA by chance. pG is the combination "+ "of pPERT and pNDE. pGFdr is the false discovery rate and pGFWER the "+ "Bonferoni corrected p-value.") // PDF report report = LatexCombiner(detReport.summary, detVennDiagram.report, qcReport.document, goClusters.report, km.report, spiaReport.report) geneMapping = CSVFilter(transcriptAnnotation, includeColumns="Transcript,Gene") /** Transcript result data processing. */ transcriptStatisticsQuery = INPUT(path="sql/TranscriptStatistics.sql") allProbes = TableQuery(geneMapping, logratio.expr, detTTest.pvalues, query = transcriptStatisticsQuery) return record(report = report.document, det = detTable.table, exprIndicator = km.sampleExpressionIndicator, survival = km.survivalTable, kaplanMeier = km.report, kaplanMeierStatistics = km.statistics, allProbes = allProbes.table) } function TranscriptSurvival(SampleGroupTable sampleGroups, CSV patientMetadata, LogMatrix logratio, optional IDList det, optional AnnotationTable detAnnotation, float fcThreshold, string title, float pLimit = 0.05, int minCount = 20, string annotCol = "*") -> (Latex report, CSV statistics, CSV survivalTable, Matrix sampleExpressionIndicator){ /** Generates a table of samples and their survival times. */ survivalQuery = INPUT(path="sql/exonSurvival.sql") sampleNames = ExpandCollapse(sampleGroups) survival = TableQuery(sampleNames, patientMetadata, query=survivalQuery) sampleExpression = SampleExpression(force logratio, sampleGroups, threshold = fcThreshold, referenceGroup = "normal_tissue") if (det == null) { sampleExpressionFilt = sampleExpression.indicator } else { sampleExpressionFilt = CSVFilter(sampleExpression.indicator, det) } sampleExpressionTrans = MatrixTranspose(sampleExpressionFilt, rowName="SampleID") survivalTable = CSVJoin(survival.table, sampleExpressionTrans) survivalTableConv = IDConvert(survivalTable, survival.table, sourceColumn = "SampleID", conversionColumn = "PatientID", unique = true, collapseNumeric = "indicator") km = KaplanMeier(survivalTableConv.csv, detAnnotation, statusCol = "VitalStatus", timeCol = "SurvivalEndTime", groupCol = "*", eventSymbol = "DECEASED", PLimit = pLimit, minCount = minCount, timeOutLimit = 36, excludeCol = "SampleID,PatientID", annotCol = annotCol) kmReport = LatexCombiner(km.report, sectionTitle = title, pagebreak = true, @keep = false) return record(report = kmReport.document, statistics = km.statistics, survivalTable = survival.table, sampleExpressionIndicator = sampleExpression.indicator) }