This tutorial demonstrates how to use GeomxTools for preprocessing protein or proteogenomics data.
Data processing is very similar to what is shown in the Developer_Introduction_to_the_NanoStringGeoMxSet and GeoMx Workflow vignettes with a couple of protein specific functions.
library(GeomxTools)
GeoMxSet objects can only read in one analyte at a time. With protein or proteogenomics data, the desired analyte must be added to the call to read in the object. RNA is the default analyte.
datadir <- system.file("extdata","DSP_Proteogenomics_Example_Data",
package = "GeomxTools")
DCCFiles <- unzip(zipfile = file.path(datadir, "/DCCs.zip"))
PKCFiles <- unzip(zipfile = file.path(datadir, "/pkcs.zip"))
SampleAnnotationFile <- file.path(datadir, "Annotation.xlsx")
RNAData <- suppressWarnings(readNanoStringGeoMxSet(dccFiles = DCCFiles,
pkcFiles = PKCFiles,
phenoDataFile = SampleAnnotationFile,
phenoDataSheet = "Annotations",
phenoDataDccColName = "Sample_ID",
protocolDataColNames = c("Tissue",
"Segment_Type",
"ROI.Size"),
configFile = NULL,
analyte = "RNA",
phenoDataColPrefix = "",
experimentDataColNames = NULL))
proteinData <- suppressWarnings(readNanoStringGeoMxSet(dccFiles = DCCFiles,
pkcFiles = PKCFiles,
phenoDataFile = SampleAnnotationFile,
phenoDataSheet = "Annotations",
phenoDataDccColName = "Sample_ID",
protocolDataColNames = c("Tissue",
"Segment_Type",
"ROI.Size"),
configFile = NULL,
analyte = "protein",
phenoDataColPrefix = "",
experimentDataColNames = NULL))
RNAData <- aggregateCounts(RNAData)
RNAData
## NanoStringGeoMxSet (storageMode: lockedEnvironment)
## assayData: 18677 features, 84 samples
## element names: exprs
## protocolData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: FileVersion SoftwareVersion ... ROI.Size (17 total)
## varMetadata: labelDescription
## phenoData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: Plate Well ... NegGeoSD_Hs_R_NGS_WTA_v1.0 (13 total)
## varMetadata: labelDescription
## featureData
## featureNames: A2M NAT2 ... CST2 (18677 total)
## fvarLabels: TargetName Module ... Negative (6 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: Hs_R_NGS_WTA_v1.0.pkc
## signature: none
## feature: Target
## analyte: RNA
#Protein Data is aggregated automatically on readin
proteinData
## NanoStringGeoMxSet (storageMode: lockedEnvironment)
## assayData: 147 features, 84 samples
## element names: exprs
## protocolData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: FileVersion SoftwareVersion ... ROI.Size (17 total)
## varMetadata: labelDescription
## phenoData
## sampleNames: DSP-1001900002618-G-A02.dcc DSP-1001900002618-G-A03.dcc
## ... DSP-1001900002618-G-H01.dcc (84 total)
## varLabels: Plate Well ... Y (11 total)
## varMetadata: labelDescription
## featureData
## featureNames: Ms IgG1 CD45 ... ADAM10 (147 total)
## fvarLabels: RTS_ID TargetName ... Negative (8 total)
## fvarMetadata: labelDescription
## experimentData: use 'experimentData(object)'
## Annotation: Hs_P_NGS_ADPath_Ext_v1.0.pkc Hs_P_NGS_ADPath_v1.0.pkc Hs_P_NGS_Autophagy_v1.0.pkc Hs_P_NGS_CellDeath_v1.0.pkc Hs_P_NGS_Core_v1.0.pkc Hs_P_NGS_GlialSubtype_v1.0.pkc Hs_P_NGS_IODrugTarget_v1.0.pkc Hs_P_NGS_ImmuneActivation_v1.0.pkc Hs_P_NGS_ImmuneCellTyping_v1.0.pkc Hs_P_NGS_MAPK_v1.1.pkc Hs_P_NGS_Myeloid_v1.0.pkc Hs_P_NGS_NeuralCellTyping_v1.0.pkc Hs_P_NGS_PDPath_v1.0.pkc Hs_P_NGS_PI3K_AKT_v1.0.pkc Hs_P_NGS_PanTumor_v1.0.pkc
## signature: none
## feature: Target
## analyte: Protein
By having the datasets split by analyte, each object can go through the typical QC and normalization steps specific to that analyte.
For RNA please refer to the introduction or GeoMx Workflow vignettes.
After reading in the object, we will do one QC step: flag and remove low quality ROIs
proteinData <- setSegmentQCFlags(proteinData, qcCutoffs = list(percentSaturation = 45,
minSegmentReads=1000,
percentAligned=80,
minNegativeCount=10,
maxNTCCount=60,
minNuclei=16000,
minArea=20))
# low sequenced ROIs
lowSaturation <- which(as.data.frame(protocolData(proteinData)[["QCFlags"]])["LowSaturation"] == TRUE)
# remove low quality ROIs
passedQC <- proteinData[, -lowSaturation]
dim(proteinData)
## Features Samples
## 147 84
dim(passedQC)
## Features Samples
## 147 82
Housekeepers and negative controls (IgGs) can easily be pulled out of the dataset.
hk.names <- hkNames(proteinData)
hk.names
## [1] "Histone H3" "GAPDH" "S6"
igg.names <- iggNames(proteinData)
igg.names
## [1] "Ms IgG1" "Ms IgG2a" "Rb IgG"
For the target QC step, we identify proteins with potentially little useful signal using this figure.
fig <- qcProteinSignal(object = proteinData, neg.names = igg.names)
proteinOrder <- qcProteinSignalNames(object = proteinData, neg.names = igg.names)
genesOfInterest <- c(which(proteinOrder == "Tyrosine Hydroxylase"),
which(proteinOrder == "ApoA-I"),
which(proteinOrder == "EpCAM"))
fig()
rect(xleft = 0, xright = 4,
ybottom = -2, ytop = 2, density = 0, col = "#1B9E77", lwd = 2)
rect(xleft = genesOfInterest[1]-1, xright = genesOfInterest[1]+1,
ybottom = -2, ytop = 1.25, density = 0, col = "#D95F02", lwd = 2)
rect(xleft = genesOfInterest[2]-1, xright = genesOfInterest[2]+1,
ybottom = -1, ytop = 3, density = 0, col = "#66A61E", lwd = 2)
rect(xleft = genesOfInterest[3]-1, xright = genesOfInterest[3]+1,
ybottom = -3, ytop = 6.5, density = 0, col = "#E7298A", lwd = 2)