// Andril Case Study - miRNA EXPRESSION AND SURVIVAL ANALYSIS // author: Viljami Aittomaki (adapted from code by Marko Laakso) function MiRNAAnalysis(LogMatrix mExprNormalized, CSV probeAnnot, CSV miRNANames, CSV miRNATargets, float fcThreshold = 2.0, float pValueThreshold = 0.05, float survivalThreshold = 0.1, float varFilterThreshold = 0, float miRNATargetThreshold = 0.00001) -> (Latex report, CSV dems, Matrix exprIndicator, CSV survival, Latex kaplanMeier, CSV kaplanMeierStatistics) { // Create miRNA annotation table filteredMiRNATargets = CSVFilter(csv = miRNATargets, highBound = "PVALUE_OG="+miRNATargetThreshold, includeColumns = "SEQ,EXTERNAL_NAME,TRANSCRIPT_ID", rename = "SEQ=miRNAName,EXTERNAL_NAME=targetGeneName,TRANSCRIPT_ID=targetTranscriptId") collapsedMiRNATargets = ExpandCollapse(relation = filteredMiRNATargets, expand = false, listCols = "targetGeneName,targetTranscriptId") collapsedMiRNANames = ExpandCollapse(relation = miRNANames, expand = false, listCols = "stemLoopId,stemLoopName,stemLoopDescription") miRNAAnnot = CSVJoin(csv1 = collapsedMiRNANames, csv2 = collapsedMiRNATargets, useKeys = true, intersection = false, keyColumnNames = "miRNAName,miRNAName") /** List of samples without clinical annotation. */ samplesWithoutClinicalAnnotation = INPUT(path="csv/samplesWithoutClinicalAnnotation.csv") /** R Script for filtering out samples without clinical annotation. */ filterSamplesRScript = INPUT(path="r/filterSamples.R") /** Filter samples without clinical annotation. */ sampleFilter = REvaluate(table1 = mExprNormalized, table2 = samplesWithoutClinicalAnnotation, script = filterSamplesRScript) /* Join duplicate probes and filter low variance rows. */ probeJoin = IDConvert(force sampleFilter.table, probeAnnot, collapseNumeric = "median", conversionColumn = "miRNAName", sourceColumn = "AgilentProbe", targetColumn = "miRNAName", unique = true) mExpr = VariationFilter(matrix = probeJoin, threshold = varFilterThreshold) /** Map sample names into their clinical categories. */ sampleGroups = SampleGroupCreator(mExpr, pattern1 = "TCGA_[0-9]{2}_[0-9]{4}_01", definition1 = "tumor_solid,mean,Solid tumor samples", pattern2 = "TCGA_[0-9]{2}_[0-9]{4}_11", definition2 = "normal_tissue,mean,Normal tissue samples", pattern3 = "tumor_solid,normal_tissue", definition3 = "tumor_vs_normal,ratio,fold change of tumor median vs. normal median", patternTypes = "re,re,verbatim") // EXPRESSION ANALYSIS /** Compute medians of categories and logratios between. */ groupMeans = SampleCombiner(expr = mExpr, groups = sampleGroups.groups, groupIDs = "tumor_solid,normal_tissue") logratio = SampleCombiner(expr = groupMeans.expr, groups = sampleGroups.groups, groupIDs = "tumor_vs_normal") /** Calculate differentially expressed genes based on fold change. */ demFoldChange = FoldChange(logratio = logratio, threshold = fcThreshold) /** Calculate differentially expressed genes based on t-test. */ tTest = StatisticalTest(matrix = mExpr, groups = sampleGroups.groups, correction = "fdr", targetColumns = "tumor_solid", referenceColumns = "normal_tissue", outputSet = "tumor_vs_normal_tTest", threshold = pValueThreshold) /** Selection of differentially expressed genes by their fold change and t-test calls. */ demTransformation = INPUT(path="csv/FoldChangeAndTTest.csv") /** Take differentially expressed miRNAs based on fold change and t-test. */ demSet = SetTransformer(transformation = demTransformation, set1 = demFoldChange.deg, set2 = tTest.idlist, includeOriginal = true) dems = CSV2IDList(demSet, regexp1 = "ID=over|under", columnIn = "Members", columnOut = "miRNAName", isList = true) /** This query combines miRNA fold changes and p-values with annotations. */ miRNAStatQuery = INPUT(path="sql/miRNAStatistics.sql") miRNAStat = TableQuery(table1 = miRNAAnnot, table2 = logratio, table3 = tTest.pvalues, query = miRNAStatQuery) miRNAStatGenes = KorvasieniAnnotator(miRNAStat, ensembl, inputDB=".TranscriptId", targetDB=".GeneId", keyColumn="targetTranscriptId", isListKey=true, unique=true) miRNAStatBand = KorvasieniAnnotator(miRNAStat, ensembl, inputDB=".GeneName", targetDB=".DNABand", keyColumn="miRNAName", isListKey=false, echoColumns="*",rename=".DNABand=DNABand") miRNAStatConverted = IDConvert(miRNAStatBand, miRNAStatGenes.bioAnnotation, sourceColumn="targetTranscriptId", conversionColumn=".GeneId", split=true, originalWhenMissing=true, targetColumn="GeneID") miRNAStatFiltered = CSVFilter(miRNAStatConverted.csv, includeColumns="miRNAName,miRNAId,FoldChange,PValueCorrected,stemLoopName,GeneID,DNABand", rename="PValueCorrected=PValue,GeneID=TargetGenes") /** Report of differentially expressed miRNAs. */ demReport = DEGReport(deg = demSet, geneAnnotation = miRNAAnnot, geneColumn = "stemLoopName", expr = logratio, setPattern = "over|under", sectionTitle = "Summary of differentially expressed miRNAs", sectionType = "section", columns = 8) /** Venn diagram of over, under and statistically significant sets. */ demVennDiagram = VennDiagram(demSet, sets1 = "tumor_vs_normal_tTest,tumor_vs_normal_fcOver,tumor_vs_normal_fcUnder", names1 = "p<"+pValueThreshold+",overExpressed,underExpressed", sectionType = "subsection", sectionTitle = "Venn diagram of differentially expressed and statistically significant miRNAs") // QUALITY CONTROL /** Cumulative distribution for the samples. */ cumDistReport = Plot2D(y = mExpr, caption = "Cumulative distributions of miRNA expression values for the samples.", sectionTitle = "Ordered miRNA expression values", sectionType = "subsubsection", legendPosition = "off", xLabel = "transcripts", yLabel = "intensity", title = "", sort = true, plotType = "l", imageType = "single", height = 11) /** MDS plot of samples. */ mds = MDSPlot(force expr = mExpr, groups = sampleGroups.groups, plotNames = true, sectionTitle = "Multidimensional scaling of the samples", sectionType = "subsubsection", cex = 0.2, legendPosition = "topright") /** Compile quality control report. */ qcReport = LatexCombiner(cumDistReport, mds.report, sectionTitle = "Expression quality control", sectionType = "subsection") // SURVIVAL ANALYSIS /** Survival data */ patientMetadata = INPUT(path = patientMetadataFileName) sampleNames = ExpandCollapse(sampleGroups.groups) /** SQL query to survival data preprocessing. */ survivalQuery = INPUT(path="sql/exonSurvival.sql") survival = TableQuery(table1 = sampleNames, table2 = patientMetadata, query = survivalQuery) sampleExpression = SampleExpression(force mExpr, sampleGroups.groups, threshold = fcThreshold, referenceGroup = "normal_tissue") sampleExpressionFilt = CSVFilter(sampleExpression.indicator, dems) sampleExpressionTrans = MatrixTranspose(sampleExpressionFilt, rowName = "SampleID") survivalTable = CSVJoin(survival.table, sampleExpressionTrans) survivalTableConv = IDConvert(survivalTable, survival.table, sourceColumn = "SampleID", conversionColumn = "PatientID", unique = true, collapseNumeric = "indicator") survivalAnnot = CSVFilter(miRNAAnnot, includeColumns="miRNAName,targetGeneName") km = KaplanMeier(survivalTableConv.csv, force survivalAnnot, statusCol = "VitalStatus", timeCol = "SurvivalEndTime", groupCol = "*", eventSymbol = "DECEASED", PLimit = survivalThreshold, timeOutLimit = 36, minCount = 20, excludeCol = "SampleID,PatientID,VitalStatus,SurvivalEndTime,AgeAtProcedure") /** MiRNA expression survival analysis report. */ kmReport = LatexCombiner(km.report, sectionTitle = "Survival analysis for miRNA expression") /** Compile report of miRNA analysis. */ report = LatexCombiner(demReport.summary, demVennDiagram.report, qcReport.document, kmReport.document, demReport.genelist) return record( report = report, dems = miRNAStatFiltered.csv, exprIndicator = sampleExpression.indicator, survival = survival.table, kaplanMeier = km.report, kaplanMeierStatistics = km.statistics ) }