It’s very convenient to get datasets from UCSC Xena. Normally, datasets will need to be processed and transformed with R language.

Herein, I will recover the protocol of datasets processed.

Methods

Prerequisites

1
2
3
4
library(openxlsx)
library(tidyverse)
library(limma)
library(readr)

Annotation of gene

  • Load data and annotation information
1
2
3
4
5
6
TCGA_rawdata <- read_tsv("/Users/xiaonili/Downloads/TCGA-HNSC.htseq_counts.tsv.gz")

dim(TCGA_rawdata)

probeMap <- read.table("/Users/xiaonili/Downloads/gencode.v22.annotation.gene.probeMap",sep = "\t" , header = T)
probeMap[1:4,1:4]
  • Output

    fig2

  • ID Reverse

1
2
3
4
5
TCGA_gset <- TCGA_rawdata %>%
  inner_join(probeMap, by = c("Ensembl_ID" = "id")) %>%
  select(gene, starts_with("TCGA") )

TCGA_gset[1:4,1:4]
  • Output

    fig3

  • Average replicate genes

1
2
3
4
TCGA_gset = as.data.frame(avereps(TCGA_gset[,-1],ID = TCGA_gset$gene))
colnames(TCGA_gset) <- substring(colnames(TCGA_gset),1,15) %>% gsub("-",".",.)
write.csv(TCGA_gset,"/Users/xiaonili/Downloads/TCGA_HNSC_Countdata_log2+1.csv")
TCGA_gset[1:4,1:4]
  • Output

    fig4

  • Group by patient.id

1
2
3
4
TCGA_group_list <- ifelse(as.numeric(substring(colnames(TCGA_gset),14,15)) < 10,
                          "Tumor","Normal") %>% 
  factor(.,levels = c("Normal","Tumor"))
table(TCGA_group_list)
  • Output

    fig5

Recognize mRNA lncRNA and miRNA

  • Load data
1
2
3
mRNA_info <- read.xlsx("/Users/xiaonili/Downloads/Gene_info.xlsx",sheet = "mRNA_info")
lncRNA_info <- read.xlsx("/Users/xiaonili/Downloads/Gene_info.xlsx",sheet = "lncRNA_info")
miRNA_info <- read.xlsx("/Users/xiaonili/Downloads/Gene_info.xlsx",sheet = "miRNA_info")
  • Get geneset for mRNA miRNA and lncRNA
 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
## Get data.matrix for mRNA
mRNA_gset <- TCGA_gset[rownames(TCGA_gset) %in% mRNA_info$gene_name,]
dim(mRNA_gset)

write.csv(mRNA_gset,"/Users/xiaonili/Downloads/TCGA_HNSC_mRNA.csv",quote = F,row.names = T)

## Get data.matrix for lncRNA
lncRNA_gset <- TCGA_gset[rownames(TCGA_gset) %in% lncRNA_info$gene_name,]
dim(lncRNA_gset)

write.csv(lncRNA_gset,"/Users/xiaonili/Downloads/TCGA_HNSC_lncRNA.csv",quote = F,row.names = T)

## Get data.matrix for miRNA
miRNA_gset <- TCGA_gset[rownames(TCGA_gset) %in% miRNA_info$gene_name,]
dim(miRNA_gset)

write.csv(miRNA_gset,"/Users/xiaonili/Downloads/TCGA_HNSC_miRNA.csv",quote = F,row.names = T)

Match clinical and survival information with expression

  • Load clinical data
1
2
3
4
5
6
7
Phenodata <- read_tsv("/Users/xiaonili/Downloads/TCGA-HNSC.GDC_phenotype.tsv.gz")

Phenodata[1:4,1:4]

Phenodata$submitter_id.samples <- substring(Phenodata$submitter_id.samples,1,15) %>% 
  gsub("-",".",.)
Phenodata[1:4,1:4]
  • Output

    fig6

  • Load survival data

1
2
3
4
Sur_data <- read_tsv("/Users/xiaonili/Downloads/TCGA-HNSC.survival.tsv.gz")

Sur_data$sample <- substring(Sur_data$sample,1,15) %>% gsub("-",".",.)
Sur_data[1:4,1:4]
  • Output

    fig7

  • Merge data and choose interested col

1
2
3
4
5
Phen_surv <- Phenodata %>%
  inner_join(Sur_data,by = c("submitter_id.samples" = "sample")) %>%
  select(submitter_id.samples,age_at_index.demographic,gender.demographic,
         tumor_grade.diagnoses,neoplasm_histologic_grade,tumor_stage.diagnoses,OS,OS.time)
head(Phen_surv)
  • Output

    fig8

  • match expression with phenodata and do order

1
2
Phen_surv = Phen_surv[match(colnames(TCGA_gset),Phen_surv$submitter_id.samples),]
identical(Phen_surv$submitter_id.samples,colnames(TCGA_gset))
  • group
1
2
3
4
Phen_surv$group <- TCGA_group_list
Phen_surv = dplyr::select(Phen_surv,submitter_id.samples,group,everything())
write.csv(Phen_surv,"/Users/xiaonili/Downloads/TCGA_HNSC_phenotype.csv")
head(Phen_surv)
  • Output

    fig9

Attach is the script.