############################################# # Thesis Chapter 2 ############################################# mkdir ThesisMaster mkdir Chapter2 && cd Chapter2 mkdir input output genome preprocessed export wd=~/ThesisMaster/Chapter2 export output=$wd/output export genome=$wd/genome export input=$wd/input export CPU=70 #link raw data ls /data/SIRG/TotalRNA/untrimmed/total-rna-*txt.gz |while read line; do ln -s $line input/ ; done #create sample file ls $input |cut -f 1 -d '_' |sort|uniq >sampleFile.txt export samples=$wd/sampleFile.txt ########### Section 1 - Preprocessing export trimmed=$wd/preprocessed mkdir $output/fastqcRaw fastqc $input/* -t $CPU -o $output/fastqcRaw cat $samples |while read line ;do cutadapt -m 20 -q 10,10 -A AGATCGGAAGAGC -a AGATCGGAAGAGC -o $trimmed/"$line"_1.fq.gz -p $trimmed/"$line"_2.fq.gz $input/"$line"_1_sequence.txt.gz $input/"$line"_2_sequence.txt.gz & done mkdir $output/fastqcTrimmed fastqc $trimmed/* -t $CPU -o $output/fastqcTrimmed # To conserve space and reduce redundancy on the file system, all reads and preprocessed reads and fastqcs were moved to /data/SIRG/TotalRNA and symlinked to these directories ############# Section 2 - Preparing Genome #Download genome and gtf from ensembl wget ftp://ftp.ensembl.org/pub/release-93/fasta/cricetulus_griseus_crigri/dna/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa.gz -O $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa.gz wget ftp://ftp.ensembl.org/pub/release-93/gtf/cricetulus_griseus_crigri/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf.gz -O $genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf.gz gunzip $genome/* hisat2-build $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa -p $CPU ############# Section 3 - Alignment mkdir $output/alignment cat $samples | while read file ; do hisat2 --dta --rna-strandness RF -p 4 -t -x $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa -1 $trimmed/"$file"_1.fq -2 $trimmed/"$file"_2.fq |samtools view -bS |samtools sort > $output/alignment/"$file".sorted.bam & done 2> $output/alignment/trimlog.txt ############ Section 4 - Counting gene expression mkdir $output/counts #Count with featurecounts featureCounts -p -a $genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf -o $output/counts/fcgene.all -s 2 -T 64 $output/alignment/total-rna-1.sorted.bam $output/alignment/total-rna-2.sorted.bam $output/alignment/total-rna-3.sorted.bam $output/alignment/total-rna-4.sorted.bam $output/alignment/total-rna-5.sorted.bam $output/alignment/total-rna-6.sorted.bam $output/alignment/total-rna-7.sorted.bam $output/alignment/total-rna-8.sorted.bam #Reformat for R cd $output/counts cat $output/counts/fcgene.all| grep -v '^#' |awk -F '\t' '{print $1"\t"$7"\t"$8"\t"$9"\t"$10"\t"$11"\t"$12"\t"$13"\t"$14}' | sed 1d>tmp printf '\t' >header.tmp cat $samples| tr '\n' '\t' >>header.tmp printf '\n' >>header.tmp cat header.tmp tmp >allcounts.fc sed -i 's/-/_/g' allcounts.fc rm *tmp* ########### Section 5 - Differential expression R library(DESeq2) counts<-read.table("allcounts.fc") #Create experimental design table SE<-cbind (c("NTS","NTS","NTS","NTS","TS","TS","TS","TS")) rownames(SE)<-c("total_rna_1","total_rna_2","total_rna_3","total_rna_4","total_rna_5","total_rna_6","total_rna_7","total_rna_8") colnames(SE)<-"Condition" #Exclude read counts of genes below 1cpm minLib<-min(colSums(counts)) over1CPM<- rowSums(counts) > minLib/1000000*6 counts<-counts[over1CPM,] #Differential expression test dds<-DESeqDataSetFromMatrix(countData=counts, colData=as.data.frame(SE), design=~Condition) dds$Condition <- relevel( dds$Condition, ref="NTS") dds<-DESeq(dds) #Filter for only significant results res<-results(dds, alpha= 0.05, lfcThreshold=0) res <- na.omit(res) sig<-res[res[,6]<=0.05,] #Filter for only 1.5fold change sig<-sig[abs(sig[,2])>(log2(1.5)),] sig <- (sig[order (sig$padj),]) #Write results to file (samples named so that down regulated means downregulated at 31oC) write.table(sig[sig[,2]>0,], file="upregulated1.5.tsv", sep="\t" ) write.table(sig[sig[,2]< -0,], file="downregulated1.5.tsv", sep="\t" ) #filter for 1.2 fold for proteomics overlaps sig<-res[res[,6]<=0.05,] #Filter for only 1.2fold change sig<-sig[abs(sig[,2])>(log2(1.2)),] sig <- (sig[order (sig$padj),]) #Write results to file (samples named so that down regulated means downregulated at 31oC) write.table(sig[sig[,2]>0,], file="upregulated1.2.tsv", sep="\t" ) write.table(sig[sig[,2]< -0,], file="downregulated1.2.tsv", sep="\t" ) quit() y cd .. mkdir DEGs mv $output/counts/*tsv DEGs/ #Created GeneId lists sed 1d $output/DEGs/upregulated1.5.tsv |cut -f 2 -d '"' > $output/DEGs/upregulated1.5GeneIds.txt sed 1d $output/DEGs/downregulated1.5.tsv |cut -f 2 -d '"' > $output/DEGs/downregulated1.5GeneIds.txt ############### - Section 6 - Proteomics Overlap #Differential protein expression lists were provided by Orla Coleman #Results were parsed into the files upProteomics, downProteomics and detectedProteomics.txt and saved in $input cat $input/downProteomics.txt |cut -f 1|sort |uniq> $input/downProteomicsGenes.txt cat $input/upProteomics.txt |cut -f 1|sort |uniq> $input/upProteomicsGenes.txt cat $output/DEGs/*1.5GeneIds.txt |sort |uniq > $output/DEGs/DE1.5GeneIds.txt sort $output/DEGs/upregulated1.5GeneIds.txt >$output/DEGs/upregulated1.5GeneIds.sorted.txt sort $output/DEGs/downregulated1.5GeneIds.txt >$output/DEGs/downregulated1.5GeneIds.sorted.txt mkdir $output/proteomicsOverlap comm -12 $output/DEGs/DE1.5GeneIds.txt input/detectedProteomics.txt > $output/proteomicsOverlap/deGenesDetectedInProteomics.txt #1 - what upregulated and downregulated genes were found DE in proteomics? #21 up 17 down comm -12 $output/DEGs/downregulated1.5GeneIds.sorted.txt $input/downProteomicsGenes.txt >$output/proteomicsOverlap/downInBothGeneAndProteomics.txt comm -12 $output/DEGs/upregulated1.5GeneIds.sorted.txt $input/upProteomicsGenes.txt >$output/proteomicsOverlap/upInBothGeneAndProteomics.txt #2 - what up and down regulated genes were profiled in proteomics, but not DE? # 31 up 65 down comm -12 $output/DEGs/DE1.5GeneIds.txt $input/detectedProteomics.txt > $output/proteomicsOverlap/deGenesDetectedInProteomics.txt cat $input/upProteomicsGenes.txt $input/downProteomicsGenes.txt |sort |uniq >$output/proteomicsOverlap/deProteomics.txt comm -23 $output/proteomicsOverlap/deGenesDetectedInProteomics.txt $output/proteomicsOverlap/deProteomics.txt >$output/proteomicsOverlap/deGenesDetectedInButNotDeProteomics.txt comm -12 $output/proteomicsOverlap/deGenesDetectedInButNotDeProteomics.txt $output/DEGs/upregulated1.5GeneIds.sorted.txt > $output/proteomicsOverlap/upGenesDetectedInButNotDeProteomics.txt comm -12 $output/proteomicsOverlap/deGenesDetectedInButNotDeProteomics.txt $output/DEGs/downregulated1.5GeneIds.sorted.txt > $output/proteomicsOverlap/downGenesDetectedInButNotDeProteomics.txt #3 were any up or down genes found to be down or up respectively in proteomics #1 up gene 5 down comm -12 $output/DEGs/upregulated1.5GeneIds.sorted.txt $input/downProteomicsGenes.txt > $output/proteomicsOverlap/upGeneDownProtein.txt comm -12 $output/DEGs/downregulated1.5GeneIds.sorted.txt $input/upProteomicsGenes.txt > $output/proteomicsOverlap/downGeneUpProtein.txt #4 What genes were DE in the proteomics but not at 1.2 fold in the gene lists? #119 up, 99 down cat $output/DEGs/upregulated1.2.tsv |cut -f 2 -d '"' |sort |uniq > $output/DEGs/upregulated1.2.GeneIDs.sorted.tsv cat $output/DEGs/downregulated1.2.tsv |cut -f 2 -d '"' |sort |uniq > $output/DEGs/downregulated1.2.GeneIDs.sorted.tsv comm -23 $input/downProteomicsGenes.txt $output/DEGs/downregulated1.2.GeneIDs.sorted.tsv >$output/proteomicsOverlap/downProteinNotGene.txt comm -23 $input/upProteomicsGenes.txt $output/DEGs/upregulated1.2.GeneIDs.sorted.tsv > $output/proteomicsOverlap/upProteinNotGene.txt #################### #Figures unaligned<-c(12.8132499,13.801188,13.39746653,12.89476575,14.95380812,15.6152412,15.39754033,15.8696213) rRNA<-c(0.15,0.14,0.12,0.12,0.19,0.19,0.33,0.29) mitochondrial<-c(0.87,0.96,0.97,0.92,0.98,0.96,1.34,1.06) unassigned<-c(33.76797611,30.39140049,33.12347776,30.42209611,28.44175765,31.668039,28.38384169,26.77983239) assigned<-c(52.39877399,54.70741152,52.38905571,55.64313814,55.43443423,51.5667198,54.54861798,56.0005463) readData<-cbind (unaligned,rRNA,mitochondrial,unassigned,assigned) rownames(readData)<-c("NTS_1","NTS_2","NTS_3","NTS_4","TS_1","TS_2","TS_3","TS_4") barplot(t(readData[,5:1]), col=brewer.pal(11,"Spectral")[c(9,10,11,3,1)], xlab="", ylab= "Reads (%)",ylim=c(0,100), las=3) title(xlab="Sample",mgp=c(4,1,0)) legend (x="topright", legend=c("Unaligned","rRNA","Mitochondrial","Aligned","Assigned"), inset=c(-0.27,-0), col=brewer.pal(11,"Spectral")[c(1,3,11,10,9)], pch=20) ### FC<-cbind(c("Lmnb1","Cdk1","Hnrnpc","H2afj","Lmnb1","Cdk1","Hnrnpc","H2afj"),c("Gene","Gene","Gene","Gene","Protein","Protein","Protein","Protein"),c(-1.89,-1.9,-1.66,1.93,1.71,1.21,1.25,-1.35)) colnames(FC)<-c("Gene","Experiment","FoldChange") FC<-as.data.frame(FC) FC$FoldChange=as.numeric(levels(FC$FoldChange))[FC$FoldChange] ggplot(FC,aes(Gene, FoldChange, fill=Experiment))+ geom_bar(stat="identity", position="dodge")+ scale_fill_brewer(palette="Set1") ################################### # Thesis Chapter 3 ################################### cd ~/ThesisMaster mkdir Chapter3 && cd Chapter3 mkdir input output preprocessed export wd=~/ThesisMaster/Chapter3 export output=$wd/output export genome=~/ThesisMaster/Chapter2/genome export input=$wd/input export CPU=70 ############# Section 1 - Preparing Inputs #CHO pathways of interest annotated by GO ID moved to input dir. cp ../Chapter2/sampleFile.txt . less $wd/../Chapter2/output/DEGs/downregulated1.5GeneIds.sorted.txt $wd/../Chapter2/output/DEGs/upregulated1.5GeneIds.sorted.txt|sort |uniq >$input/DEgenes.txt ############# Section 2 - Stringtie assembly #No alignment or preprocessing is needed as we use the bam files from Chapter 2 cat sampleFile.txt |while read file ; do stringtie -G $genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf -p 70 -B -j 5 -o $output/stringtie/"$file".gtf $wd/../Chapter2/output/alignment/"$file".sorted.bam ; done #Merge the novel transcripts assembled in each file into a single gtf using the ensembl gtf as master reference stringtie --merge -G $genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf -c 10 -f 0.1 -o $output/stringtie/merged.gtf -p 70 $output/stringtie/*gtf ############# Section 3 - Quantification of transcripts with Stringtie #-B option returned ballgown tables, -e to estimate abundance only with no novel transcript discovery cat sampleFile.txt | while read file ; do stringtie -G $output/stringtie/merged.gtf -p 70 -B -e -o $output/quant/"$file"/"$file".gtf $wd/../Chapter2/output/alignment/"$file".sorted.bam ; done ############# Section 4 - Differential isoform expression with ballgown R setwd("output/quant/") library(ballgown) #import the expression data into a ballgown object bg<-ballgown(dataDir=getwd(), samplePattern="total-rna", meas='all') #Assign the sample data information pData(bg)= data.frame(id=sampleNames(bg), group=rep(c(1,0), each=4)) #Create transcript gene table as a key to go between stringtie and ballgown ids. Save for later transcript_gene_table=indexes(bg)$t2g write.table(transcript_gene_table, "transcriptGeneTable.txt") #Check the sample info table phenotype_table=pData(bg) #Test for differential transcript usage stat_results<-stattest(bg,feature='transcript' , meas='FPKM', covariate='group') #Work out the fold changes and add the log2 fold change as an extra coloumn to results transcriptFPKM<-texpr(bg, 'FPKM') for (i in 1:nrow(transcriptFPKM)) stat_results[i,5]=log2((mean(transcriptFPKM[i,5:8]))/(mean(transcriptFPKM[i,1:4]))) colnames(stat_results)<- c("feature","id","pval","qval","log2FC") #annotate with gene and transcript names results_transcripts = data.frame(geneNames=ballgown::geneNames(bg), geneIDs=ballgown::geneIDs(bg), transIDs=ballgown::transcriptNames(bg), stat_results) #Filter for significant DE transcripts sig<-(na.omit(results_transcripts[results_transcripts$qval < 0.05,])) #Write results to files write.table((sig[sig$log2FC <(-log2(1.5)),]), "downregulatedTranscripts1.5.tsv") write.table((sig[sig$log2FC >(log2(1.5)),]), "upregulatedTranscripts1.5.tsv") quit() y mkdir $output/ballgown mv $output/quant/transcriptGeneTable.txt $output/ballgown/ mv $output/quant/upregulatedTranscripts1.5.tsv $output/ballgown/ mv $output/quant/downregulatedTranscripts1.5.tsv $output/ballgown/ ############# Section 5 - Identifying isoform switching #Get the ids of the genes with DE isoforms from the gtf file cat $output/ballgown/upregulatedTranscripts1.5.tsv |cut -f 6 -d '"' |while read line; do grep -w $line $output/stringtie/merged.gtf |grep -m 1 ENSCGR |cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $1"\t"$2"\t"$3 ;else print $1"\t"$2"\t"$4}';done >$output/ballgown/upTransGenes.txt cat $output/ballgown/downregulatedTranscripts1.5.tsv |cut -f 6 -d '"' |while read line; do grep -w $line $output/stringtie/merged.gtf |grep -m 1 ENSCGR |cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $1"\t"$2"\t"$3 ;else print $1"\t"$2"\t"$4}';done >$output/ballgown/downTransGenes.txt # Find the genes with both up and down transcript isoforms DE mkdir $output/comparisons cut -f 3 $output/ballgown/downTransGenes.txt |sort |uniq > $output/ballgown/downGeneIds.txt cut -f 3 $output/ballgown/upTransGenes.txt |sort |uniq > $output/ballgown/upGeneIds.txt comm -12 $output/ballgown/downGeneIds.txt $output/ballgown/upGeneIds.txt >$output/comparisons/upAndDownTranscriptDE.txt #Find transcripts DE with no gene level DE comm -23 $output/ballgown/downGeneIds.txt $wd/../Chapter2/output/DEGs/downregulated1.5GeneIds.sorted.txt > $wd/output/comparisons/downIsoformOnly.txt comm -23 $output/ballgown/upGeneIds.txt $wd/../Chapter2/output/DEGs/upregulated1.5GeneIds.sorted.txt > $wd/output/comparisons/upIsoformOnly.txt #find the isoforms changing in both with no change at the gene level isoform switching cat ~/ThesisMaster/Chapter2/output/DEGs/upregulated1.5GeneIds.txt ~/ThesisMaster/Chapter2/output/DEGs/downregulated1.5GeneIds.txt |sort |uniq |comm -13 - $output/comparisons/upAndDownTranscriptDE.txt >$output/comparisons/isoformSwitching cat $output/comparisons/isoformSwitching |while read line ;do grep $line $output/stringtie/merged.gtf |grep -m 1 gene_name ; done |cut -f 2,6,8 -d '"' |sed 's/"/\t/g' > $output/comparisons/isoformSwitchingAnnotatedList.txt ############ Section 6 - RMATs #made the b1.txt and b2.txt files with the aligned bams from chapter 2 in the input directory and installed rmats mkdir $output/rmats python rMATS.4.0.2/rMATS-turbo-Linux-UCS4/rmats.py --b1 b1.txt --b2 b2.txt --gtf $output/stringtie/merged.gtf --od $output/rmats/ -t paired --nthread 70 --readLength 150 --tstat 70 --libType fr-secondstrand 2> $output/rmats/rmatsLog.txt mkdir output/rmatsResults/ #Return the hits we are confident in - those with under 0.05 FDR, a 10% change in the inclusion level. awk 'function abs(x){return ((x < 0.0) ? -x : x)} { if (($20 < 0.05) && (abs($23) > 0.1 )) { print } }' $output/rmats/SE.MATS.JC.txt > $output/rmatsResults/SE.MATS.JunctionCountOnly.txt awk 'function abs(x){return ((x < 0.0) ? -x : x)} { if (($20 < 0.05) && (abs($23) > 0.1 )) { print } }' $output/rmats/RI.MATS.JC.txt > $output/rmatsResults/RI.MATS.JunctionCountOnly.txt awk 'function abs(x){return ((x < 0.0) ? -x : x)} { if (($20 < 0.05) && (abs($23) > 0.1 )) { print } }' $output/rmats/A3SS.MATS.JC.txt > $output/rmatsResults/A3SS.MATS.JunctionCountOnly.txt awk 'function abs(x){return ((x < 0.0) ? -x : x)} { if (($20 < 0.05) && (abs($23) > 0.1 )) { print } }' $output/rmats/A5SS.MATS.JC.txt > $output/rmatsResults/A5SS.MATS.JunctionCountOnly.txt awk 'function abs(x){return ((x < 0.0) ? -x : x)} { if (($22 < 0.05) && (abs($25) > 0.1 )) { print } }' $output/rmats/MXE.MATS.JC.txt > $output/rmatsResults/MXE.MATS.JunctionCountOnly.txt #Annotate the geneIDs against the gtf cat $output/rmatsResults/A3SS.MATS.JunctionCountOnly.txt | while read line ; do echo $line | tr '\n' '\t'; echo $line | cut -f 2 -d '"' | while read geneID ; do grep -w $geneID $output/stringtie/merged.gtf |grep -m 1 ENSCGR | cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $3 ;else print $4}' |tr -d '\n' ; done; printf '\n' ; done > $output/rmatsResults/A3SS.annotated.txt ; cat $output/rmatsResults/A5SS.MATS.JunctionCountOnly.txt | while read line ; do echo $line | tr '\n' '\t'; echo $line | cut -f 2 -d '"' | while read geneID ; do grep -w $geneID $output/stringtie/merged.gtf |grep -m 1 ENSCGR | cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $3 ;else print $4}' |tr -d '\n' ; done; printf '\n' ; done > $output/rmatsResults/A5SS.annotated.txt ; cat $output/rmatsResults/RI.MATS.JunctionCountOnly.txt | while read line ; do echo $line | tr '\n' '\t'; echo $line | cut -f 2 -d '"' | while read geneID ; do grep -w $geneID $output/stringtie/merged.gtf |grep -m 1 ENSCGR | cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $3 ;else print $4}' |tr -d '\n' ; done; printf '\n' ; done > $output/rmatsResults/RI.annotated.txt ; cat $output/rmatsResults/MXE.MATS.JunctionCountOnly.txt | while read line ; do echo $line | tr '\n' '\t'; echo $line | cut -f 2 -d '"' | while read geneID ; do grep -w $geneID $output/stringtie/merged.gtf |grep -m 1 ENSCGR | cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $3 ;else print $4}' |tr -d '\n' ; done; printf '\n' ; done > $output/rmatsResults/MXE.annotated.txt ; cat $output/rmatsResults/SE.MATS.JunctionCountOnly.txt | while read line ; do echo $line | tr '\n' '\t'; echo $line | cut -f 2 -d '"' | while read geneID ; do grep -w $geneID $output/stringtie/merged.gtf |grep -m 1 ENSCGR | cut -f 2,4,6,8 -d '"' |awk -F'"' '{if ($4=="") print $3 ;else print $4}' |tr -d '\n' ; done; printf '\n' ; done > $output/rmatsResults/SE.annotated.txt ; #Find DEU without DGE grep -v -f $input/DEgenes.txt $output/rmatsResults/SE.annotated.txt > $output/rmatsResults/SE.DEUonly.annotated.txt grep -v -f $input/DEgenes.txt $output/rmatsResults/A5SS.annotated.txt > $output/rmatsResults/A5SS.DEUonly.annotated.txt grep -v -f $input/DEgenes.txt $output/rmatsResults/A3SS.annotated.txt > $output/rmatsResults/A3SS.DEUonly.annotated.txt grep -v -f $input/DEgenes.txt $output/rmatsResults/MXE.annotated.txt > $output/rmatsResults/MXE.DEUonly.annotated.txt grep -v -f $input/DEgenes.txt $output/rmatsResults/RI.annotated.txt > $output/rmatsResults/RI.DEUonly.annotated.txt # cat SE.DEUonly.annotated.txt |while read line ; do echo $line |awk -F' ' '{$NF="" ;print $0}' |tr '\n' '\t'; echo $line |awk -F' ' '{print $NF}' |while read gene ; do grep -w $gene $input/choPathwaysOfInterest.txt |tr '\n' '\t' ; done; printf '\n'; done |grep GO > SE.DEUonly.annotated.Pathways.txt cat A3SS.DEUonly.annotated.txt |while read line ; do echo $line |awk -F' ' '{$NF="" ;print $0}' |tr '\n' '\t'; echo $line |awk -F' ' '{print $NF}' |while read gene ; do grep -w $gene $input/choPathwaysOfInterest.txt |tr '\n' '\t' ; done; printf '\n'; done |grep GO > A3SS.DEUonly.annotated.Pathways.txt cat A5SS.DEUonly.annotated.txt |while read line ; do echo $line |awk -F' ' '{$NF="" ;print $0}' |tr '\n' '\t'; echo $line |awk -F' ' '{print $NF}' |while read gene ; do grep -w $gene $input/choPathwaysOfInterest.txt |tr '\n' '\t' ; done; printf '\n'; done |grep GO > A5SS.DEUonly.annotated.Pathways.txt cat MXE.DEUonly.annotated.txt |while read line ; do echo $line |awk -F' ' '{$NF="" ;print $0}' |tr '\n' '\t'; echo $line |awk -F' ' '{print $NF}' |while read gene ; do grep -w $gene $input/choPathwaysOfInterest.txt |tr '\n' '\t' ; done; printf '\n'; done |grep GO > MXE.DEUonly.annotated.Pathways.txt cat RI.DEUonly.annotated.txt |while read line ; do echo $line |awk -F' ' '{$NF="" ;print $0}' |tr '\n' '\t'; echo $line |awk -F' ' '{print $NF}' |while read gene ; do grep -w $gene $input/choPathwaysOfInterest.txt |tr '\n' '\t' ; done; printf '\n'; done |grep GO > RI.DEUonly.annotated.Pathways.txt #Split the results per GO ID per type of splice mkdir $output/rmatsResults/GO cat $input/choPathwaysOfInterest.txt |cut -f 3 -d ' ' |grep GO |sort |uniq |while read GOID;do awk -F'\t' '$0~/'$GOID'/ {print $1}' $output/rmatsResults/SE.DEUonly.annotated.Pathways.txt > $output/rmatsResults/GO/"$GOID".SE.txt;done cat $input/choPathwaysOfInterest.txt |cut -f 3 -d ' ' |grep GO |sort |uniq |while read GOID;do awk -F'\t' '$0~/'$GOID'/ {print $1}' $output/rmatsResults/A3SS.DEUonly.annotated.Pathways.txt > $output/rmatsResults/GO/"$GOID".A3SS.txt;done cat $input/choPathwaysOfInterest.txt |cut -f 3 -d ' ' |grep GO |sort |uniq |while read GOID;do awk -F'\t' '$0~/'$GOID'/ {print $1}' $output/rmatsResults/A5SS.DEUonly.annotated.Pathways.txt > $output/rmatsResults/GO/"$GOID".A5SS.txt;done cat $input/choPathwaysOfInterest.txt |cut -f 3 -d ' ' |grep GO |sort |uniq |while read GOID;do awk -F'\t' '$0~/'$GOID'/ {print $1}' $output/rmatsResults/RI.DEUonly.annotated.Pathways.txt > $output/rmatsResults/GO/"$GOID".RI.txt;done cat $input/choPathwaysOfInterest.txt |cut -f 3 -d ' ' |grep GO |sort |uniq |while read GOID;do awk -F'\t' '$0~/'$GOID'/ {print $1}' $output/rmatsResults/MXE.DEUonly.annotated.Pathways.txt > $output/rmatsResults/GO/"$GOID".MXE.txt;done #Clean up the empty files find $output/rmatsResults/GO -size 0 -delete ############### Rmats2sashimi cd $output/rmatsResults/GO #remove the 'chr' rmats adds to the chromosome name and turn spaces into tabs ls GO* |while read line ; do sed -i 's/ /\t/g' $line ; sed -i 's/chr//g' $line ;done cd ~/ThesisMaster/Chapter2/output/alignment python rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 ~/ThesisMaster/Chapter2/output/alignment/total-rna-1.sorted.bam --b2 ~/ThesisMaster/Chapter2/output/alignment/total-rna-5.sorted.bam -t SE -e GO:0001836.SE.txt --l1 NTS --l2 TS -o . mkdir $output/rmatsResults/GO/SEplots $output/rmatsResults/GO/RIplots $output/rmatsResults/GO/A3SSplots $output/rmatsResults/GO/A5SSplots $output/rmatsResults/GO/MXEplots ls /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/GO*SE.txt |while read line ;do mkdir "$line"plots; python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t SE -e $line --l1 NTS --l2 TS -o /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/SEplots --group-info $output/rmatsResults/GO/grp.gp --color "r","b" --intron_s 10 &done ls /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/GO*MXE.txt |while read line ;do mkdir "$line"plots; python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t MXE -e $line --l1 NTS --l2 TS -o /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/MXEplots --group-info $output/rmatsResults/GO/grp.gp --color "r","b" --intron_s 10 &done ls /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/GO*A3SS.txt |while read line ;do mkdir "$line"plots; python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A3SS -e $line --l1 NTS --l2 TS -o /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/A3SSplots --group-info $output/rmatsResults/GO/grp.gp --color "r","b" --intron_s 10 &done ls /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/GO*A5SS.txt |while read line ;do mkdir "$line"plots; python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A5SS -e $line --l1 NTS --l2 TS -o /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/A5SSplots --group-info $output/rmatsResults/GO/grp.gp --color "r","b" --intron_s 10 &done ls /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/GO*RI.txt |while read line ;do mkdir "$line"plots; python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t RI -e $line --l1 NTS --l2 TS -o /home/craig/ThesisMaster/Chapter3/output/rmatsResults/GO/RIplots --group-info $output/rmatsResults/GO/grp.gp --color "r","b" --intron_s 10 &done ############### Find protein domains R SE<-(read.table("SE.DEUonly.annotated.Pathways.rmats.txt")) library(biomaRt) listMarts() ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) SE_domain_information<- getBM(attributes=c('ensembl_gene_id','external_gene_name','ensembl_exon_id','interpro','interpro_start','interpro_end','interpro_short_description','interpro_description'),filters= 'ensembl_exon_id', values = SE_exons_overlap$ensembl_exon_id,uniqueRows = T,mart = ensembl) # get availible exon information from ENSEMBL for the SE events of interest of non DE genes # scaffold id, strand, gene id, ensembl id, start and end position for each exon SE_exon_information<- getBM(attributes=c('chromosome_name','strand','ensembl_gene_id','external_gene_name','ensembl_exon_id','exon_chrom_start', 'exon_chrom_end'),filters = 'ensembl_gene_id', values = SE$V24,uniqueRows = T, mart = ensembl) # create granges object for exon information from ENSEMBL library(GenomicRanges) gr.exons <- GRanges(seqnames = SE_exon_information$chromosome_name, ranges = IRanges(start = SE_exon_information$exon_chrom_start, end = SE_exon_information$exon_chrom_end, names = SE_exon_information$ensembl_exon_id)) # create granges object for exon information from selected rmats output gr.rmats.exons <- GRanges(seqnames = SE$V4, ranges = IRanges(start = SE$V6, end = SE$V7, names = SE$V1), strand= SE$V5) # compare exons to determine which ones are skipped. Using exact overlap option in findOverlaps SE_overlap<-findOverlaps(gr.rmats.exons, gr.exons) SE_exons_overlap <- cbind(SE_exon_information[SE_overlap@to,], SE[SE_overlap@from,]) # now use the ENSEMBL exon Ids to get the affected domains SE_domain_information<-getBM(attributes=c('ensembl_gene_id','external_gene_name','ensembl_exon_id','interpro','interpro_start','interpro_end','interpro_short_description','interpro_description'),filters= 'ensembl_exon_id', values = SE_exons_overlap$ensembl_exon_id,uniqueRows = T,mart = ensembl) write.table(SE_domain_information, "skippedExonsWithOverlappingDomainNonDeGenesPathways.txt") # repeat domain annotation for all SE events - same code as above except we are not working on the all skipped exons. SE<-(read.table("SE.DEUonly.annotatedGeneIDs.txt")) SE_exon_information<- getBM(attributes=c('chromosome_name','strand','ensembl_gene_id','external_gene_name','ensembl_exon_id','exon_chrom_start', 'exon_chrom_end'), filters = 'ensembl_gene_id', values = SE[,24], uniqueRows = T, mart = ensembl) gr.exons <- GRanges(seqnames = SE_exon_information$chromosome_name, ranges = IRanges(start = SE_exon_information$exon_chrom_start, end = SE_exon_information$exon_chrom_end, names = SE_exon_information$ensembl_exon_id),strand=SE_exon_information$strand) gr.rmats.exons <- GRanges(seqnames = as.character(SE$V4), ranges = IRanges(start = SE$V6, end = SE$V7, names = as.character(SE$V1))) SE_overlap<-findOverlaps(gr.rmats.exons, gr.exons) SE_exons_overlap <- cbind(SE_exon_information[SE_overlap@to,], SE[SE_overlap@from,]) SE_domain_information<- getBM(attributes=c('ensembl_gene_id','external_gene_name','ensembl_exon_id','interpro','interpro_start','interpro_end','interpro_short_description','interpro_description'),filters= 'ensembl_exon_id', values = SE_exons_overlap$ensembl_exon_id,uniqueRows = T,mart = ensembl) write.table(SE_domain_information, "skippedExonsWithOverlappingDomainNonDeGenesAll.txt") quit() y #################### Section 7 - Figures #transcripts plot library(reshape2) library(ggplot2) transcripts<-(read.table("noTranscripts.tsv")) transcripts<-as.data.frame(melt (as.matrix(transcripts)) colnames(transcripts)<-c("Transcripts","GTF","Genes") ggplot(transcripts,aes(Transcripts,Genes,fill=GTF))+ geom_bar(stat="identity",position="dodge") #Sashimi cd ~/ThesisMaster/Chapter3/output/rmatsResults grep MSTRG.20426 A5SS.MATS.JunctionCountOnly.txt |sed 's/chr//g'> ~/ThesisMaster/Chapter2/output/alignment/Sc5d_A5SS.txt grep MSTRG.20426 A3SS.MATS.JunctionCountOnly.txt|sed 's/chr//g' > ~/ThesisMaster/Chapter2/output/alignment/Sc5d_A3SS.txt grep MSTRG.12400 A3SS.MATS.JunctionCountOnly.txt |sed 's/chr//g'>~/ThesisMaster/Chapter2/output/alignment/Ghdc_A3SS.txt grep MSTRG.23323 RI.MATS.JunctionCountOnly.txt |sed 's/chr//g'> ~/ThesisMaster/Chapter2/output/alignment/bcl2l12.txt cd ~/ThesisMaster/Chapter2/output/alignment/ MSTRG.15562 cd ~/ThesisMaster/Chapter3/output/rmatsResults grep MSTRG.15562 MXE.MATS.JunctionCountOnly.txt |sed 's/chr//g'> ~/ThesisMaster/Chapter2/output/alignment/ppargMXE.txt grep MSTRG.15562 SE.MATS.JunctionCountOnly.txt |sed 's/chr//g'> ~/ThesisMaster/Chapter2/output/alignment/ppargSE.txt cd ~/ThesisMaster/Chapter2/output/alignment/ mkdir ppargMXE ppargSE python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t MXE -e ppargMXE.txt --l1 NTS --l2 TS -o ./ppargSashimiPlotMXE --group-info grp.gp --color "r","b" --intron_s 10 python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t SE -e ppargSE.txt --l1 NTS --l2 TS -o ./ppargSashimiPlotSE --group-info grp.gp --color "r","b" --intron_s 10 cp ../../../Chapter3/output/rmatsResults/GO/grp.gp . mkdir SashimiPlot GhdcSashimiPlot Sc5d3SashimiPlot BCL2l12SashimiPlot python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A5SS -e Sc5d_A5SS.txt --l1 NTS --l2 TS -o ./SashimiPlot --group-info grp.gp --color "r","b" --intron_s 10 python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A3SS -e Sc5d_A3SS.txt --l1 NTS --l2 TS -o ./Sc5d3SashimiPlot --group-info grp.gp --color "r","b" --intron_s 10 python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A3SS -e Ghdc_A3SS.txt --l1 NTS --l2 TS -o ./GhdcSashimiPlot --group-info grp.gp --color "r","b" --intron_s 10 python ../../../Chapter3/output/rmatsResults/GO/rmats2sashimiplot/src/rmats2sashimiplot/rmats2sashimiplot.py --b1 total-rna-1.sorted.bam,total-rna-2.sorted.bam,total-rna-3.sorted.bam,total-rna-4.sorted.bam --b2 total-rna-5.sorted.bam,total-rna-6.sorted.bam,total-rna-7.sorted.bam,total-rna-8.sorted.bam -t A3SS -e bcl2l12.txt --l1 NTS --l2 TS -o ./BCL2l12SashimiPlot --group-info grp.gp --color "r","b" --intron_s 10 #splicing pie splicingDF<-data.frame(SpliceType= c("SE","RI","A5SS","A3SS","MXE"), DifferentialSpliceEvents = c(1949,179,325,252,453)) bp <- ggplot (splicingDF, aes(x="", y=DifferentialSpliceEvents, fill=SpliceType))+ geom_bar(stat = "identity") pie <- bp + coord_polar("y", start=0) blank_theme <- theme_minimal()+ theme( axis.title.x = element_blank(), axis.title.y = element_blank(), panel.border = element_blank(), panel.grid=element_blank(), axis.ticks = element_blank(), plot.title=element_text(size=14, face="bold") ) pie + blank_theme + theme(axis.text.x=element_blank()) + geom_text(aes(y = DifferentialSpliceEvents/3 + c(0, cumsum(DifferentialSpliceEvents)[-length(DifferentialSpliceEvents)]), label = percent(DifferentialSpliceEvents/100)), size=5) ###PCRplot library(ggplot2) library(gridExtra) actbData<-as.data.frame(read.table("qPcrActb2.txt",header=TRUE)) actbData$conditionF<-factor(actbData$condition,levels=c("NTS","TS")) p<-ggplot(actbData,aes(x=as.factor(Gene),y=Mean, fill=conditionF)) + geom_bar(position=position_dodge(),stat="identity",colour="black") + geom_errorbar(aes(ymin=Mean-SE,ymax=Mean+SE,width=.2), position=position_dodge(0.9),stat="identity") + scale_fill_manual(values=c("tomato2","skyblue1")) + xlab(label=NULL) + ylab(label="Delta CT")+ guides(fill=guide_legend(title="Condition")) + scale_y_continuous(limits=c(0,15)) + theme(text = element_text(size=15)) gusbData<-as.data.frame(read.table("qPcrGusb.txt",header=TRUE)) gusbData$conditionF<-factor(gusbData$condition,levels=c("NTS","TS")) q<-ggplot(gusbData,aes(x=as.factor(Gene),y=Mean, fill=conditionF)) + geom_bar(position=position_dodge(),stat="identity",colour="black") + geom_errorbar(aes(ymin=Mean-SE,ymax=Mean+SE,width=.2), position=position_dodge(0.9),stat="identity") + scale_fill_manual(values=c("tomato2","skyblue1")) + xlab(label="Transcript Isoform") + ylab(label="Delta CT")+ guides(fill=guide_legend(title="Condition")) + scale_y_continuous(limits=c(-2.5,9)) + theme(text = element_text(size=15)) grid.arrange(p,q,ncol=1) ggplot(gusbData,aes(x=as.factor(CellLine),y=Mean, fill=conditionF)) + geom_bar(position=position_dodge(),stat="identity",colour="black") + geom_errorbar(aes(ymin=Mean-SE,ymax=Mean+SE,width=.2), position=position_dodge(0.9),stat="identity") + scale_fill_manual(values=c("tomato2","skyblue1")) + xlab(label="Cell Line") + ylab(label="Delta CT")+ guides(fill=guide_legend(title="Condition")) + scale_y_continuous(limits=c(-2.5,9)) + theme(text = element_text(size=15)) + facet_wrap(~Gene) actbData<-as.data.frame(read.table("qPcrActb.txt",header=TRUE)) actbData$conditionF<-factor(actbData$condition,levels=c("NTS","TS")) p<-ggplot(actbData,aes(x=as.factor(CellLine),y=Mean, fill=conditionF)) + geom_bar(position=position_dodge(),stat="identity",colour="black") + geom_errorbar(aes(ymin=Mean-SE,ymax=Mean+SE,width=.2), position=position_dodge(0.9),stat="identity") + scale_fill_manual(values=c("tomato2","skyblue1")) + ylab(label="Delta CT")+ guides(fill=guide_legend(title="Condition")) + scale_y_continuous(limits=c(0,15)) + theme(text = element_text(size=15)) + facet_wrap(~Gene)+ xlab("Cell Line") ############################################# # Thesis Chapter 4 ############################################# cd ~/ThesisMaster mkdir Chapter4 && cd Chapter4 mkdir input output preprocessed export wd=~/ThesisMaster/Chapter4 export output=$wd/output export genome=~/ThesisMaster/Chapter2/genome export input=$wd/input export CPU=70 ################## Section 1 - Preparing Inputs #Download mirbase wget ftp://mirbase.org/pub/mirbase/CURRENT/mature.fa.gz wget ftp://mirbase.org/pub/mirbase/CURRENT/hairpin.fa.gz gunzip * #get the cho and orthologous sequences mkdir $input/mirBaseSequences grep -A 1 'cgr' mature.fa | sed s/--//g | sed '/^\s*$/d' > $input/mirBaseSequences/cgr_mature.fa ## linearize fasta sequences, select 245 cgr hairpins, remove '--' and empty line sed -e 's/\(^>.*$\)/#\1#/' hairpin.fa | tr -d "\r" | tr -d "\n" | sed -e 's/$/#/' | tr "#" "\n" | sed -e '/^$/d' | grep '^>cgr' -A 1 | sed s/--//g | sed '/^\s*$/d' > $input/mirBaseSequences/cgr_hairpin.fa #get the orthologs for all mammalian species 'well annotated' with over 500 pre-miRNAs or mature miRNAs grep -A 1 '^>mmu\|^>hsa\|^>rno\|^>cpo\|^>mdo\|^>bta\|^>mml\|^>eca\|^>pal\|^>ppy\|^>oan\|^>dno\|^>cfa\|^>ocu\|^>cja\|^>ptr' mature.fa |sed s/--//g | sed '/^\s*$/d' > $input/mirBaseSequences/ortho_mature.fa grep '^>' $input/mirBaseSequences/ortho_mature.fa | cut -f 1 -d '-' |sort |uniq -c |sort -r #remove whitespaces for mirdeep perl -plane 's/\s+.+$//' < $input/mirBaseSequences/cgr_mature.fa > $input/mirBaseSequences/cgr_mature_mod.fa perl -plane 's/\s+.+$//' < $input/mirBaseSequences/ortho_mature.fa > $input/mirBaseSequences/ortho_mature_mod.fa perl -plane 's/\s+.+$//' < $input/mirBaseSequences/cgr_hairpin.fa > $input/mirBaseSequences/cgr_hairpin_mod.fa mv *.fa input/mirBaseSequences/ #link the raw data mkdir $input/rawdata sudo ln -s /data/SIRG/SmallRNA/small-rna-* $input/rawdata #Create samplefile ls $input/rawdata |cut -f 1 -d '_' >sampleFile.txt export samples=$wd/sampleFile.txt #Prepare bowtie index bowtie-build /home/craig/ThesisMaster/Chapter2/genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa /home/craig/ThesisMaster/Chapter2/genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa -p $CPU ################# Section 2 - Preprocessing Reads #Trim the adapter sequences while read sample_name; do cutadapt -m 10 -a TGGAATTCTCGG $input/rawdata/"$sample_name"_sequence.txt.gz -o $wd/preprocessed/"$sample_name"_adapter_trimmed.txt.gz& done < $samples #Trim the flanking barcodes while read sample_name; do cutadapt --format=fastq -m 10 --cut=4 --cut=-4 $wd/preprocessed/"$sample_name"_adapter_trimmed.txt.gz -o $wd/preprocessed/"$sample_name"_flank_trimmed.txt.gz& done < $samples #size selection to enrich for miRNA while read sample_name; do cutadapt --format=fastq -m 17 -M 25 $wd/preprocessed/"$sample_name"_flank_trimmed.txt.gz -o $wd/preprocessed/"$sample_name"_sequence.txt.gz& done < $samples #unzip and rename while read sample_name; do gunzip $wd/preprocessed/"$sample_name"_sequence.txt.gz mv $input/preprocessed/"$sample_name"_sequence.txt $input/preprocessed/"$sample_name".fq done < $samples ################# Section 3 - miRNA detection mkdir $output/mirDeepOutput #made a sample list for mirdeep called sample_list - ''' /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-1.fq NS1 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-2.fq NS2 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-3.fq NS3 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-4.fq NS4 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-5.fq TS1 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-6.fq TS2 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-7.fq TS3 /home/craig/ThesisMaster/Chapter4/preprocessed/small-rna-8.fq TS4 ''' #Run the mapper script to prepare the reads.arf file for mirdeep input mapper.pl sample_list -d -e -h -j -l 18 -m -p $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa -t $output/mirDeepOutput/reads_v_genome.arf -v -s $output/mirDeepOutput/processed_reads.fa #Run the main miRDeep script miRDeep2.pl $output/mirDeepOutput/processed_reads.fa $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa $output/mirDeepOutput/reads_v_genome.arf $input/mirBaseSequences/cgr_mature_mod.fa $input/mirBaseSequences/ortho_mature_mod.fa $input/mirBaseSequences/cgr_hairpin_mod.fa -P 2>$output/mirDeepOutput/report.log ################### Section 4 - quantify miRNA expression mkdir $output/mapped # map to ITSR & rRNA cat sampleFile.txt |while read sample ; do bowtie -v 0 -p 70 -S itsr.fa preprocessed/preprocessedData/"$sample".fq --un itsr2/"$sample".unaligned.fa ; done >/dev/null cat sampleFile.txt |while read sample ; do bowtie -v 0 -p 70 -S ../Chapter2/genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa itsr2/"$sample".unaligned.fa ; done >itsr2/"$sample".sam # map preprocessed reads to ensembl CHOK1 genone, 0 mismatches allowed while read sampleName; do bowtie -v 0 -p 32 -S $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa $wd/preprocessed/"$sampleName".fq > $output/mapped/"$sampleName".sam samtools view -@ $CPU -bS $output/mapped/"$sampleName".sam | samtools sort -@ CPU - -o $output/mapped/"$sampleName".bam rm $output/mapped/"$sampleName".sam samtools index $output/mapped/"$sampleName".bam done < $samples #Count against miRDeep bed file mkdir $output/bedtools_counts bedtools multicov -bams $output/mapped/*.bam -bed result_*_08_2018_t_*.bed > $output/bedtools_counts/multicov_counts.bed ################### Section 5 - differential miR expression mkdir $output/DEmirs R library(DESeq2) counts<-read.table("output/bedtools_counts/multicov_counts.bed",header=F, row.names=4) counts<-counts[,9:16] colnames(counts) <- c("NTS1","NTS2","NTS3","NTS4","TS1","TS2","TS3","TS4") my.design <- data.frame(row.names = colnames( counts ), condition = c("NTS","NTS","NTS","NTS","TS","TS","TS","TS") ) DESeq_data <- DESeqDataSetFromMatrix(countData = counts, colData = my.design, design = ~condition) DESeq_data$condition <- relevel( DESeq_data$condition, ref="NTS") keep <- rowSums(counts(DESeq_data)) >= 10 DESeq_data <- DESeq_data[keep,] dds <- DESeq(DESeq_data) res <- results(dds, name="condition_TS_vs_NTS",lfcThreshold=0) sig_de_results <-subset(res, padj < 0.05) sig_de_results <- sig_de_results[order(sig_de_results$log2FoldChange, decreasing=T),] write.csv(as.data.frame(sig_de_results), file="output/DEmirs/condition_TS_vs_NS.csv") quit() y #Combine DE results with miRDeep results awk -F'",' '{print$1}' $output/DEmirs/condition_TS_vs_NS.csv |awk -F: '{print $2}' |sed -e '/^$/d'|while read line ;do fgrep -w "$line" result_03_08_2018_t_14_23_35.csv ; done > $output/DEmirs/novelDEmirs.tsv paste $output/DEmirs/novelDEmirs.tsv $output/DESeq2/condition_TS_vs_NS.csv >$output/DEmirs/Results.tsv #opened in excel to format correctly and insert headers #Filter results for score >10 and over 1.2 fold expression awk '{if ($21 >= 0.26 && $2 >= 10) print $0 }' $output/DEmirs/Results.tsv > $output/DEmirs/upMir1.2Results.txt ################### Section 6 - Target prediction with multiMir mkdir $output/multiMir # created a list of mirbase accession codes for the up and down miRNAs and saved in $output/multimir/[up|down]_miRNAs.txt #manually ensured they had the same seed as the DE cho mirna R library(biomaRt) library(multiMiR) #Retrieve ensembl IDs from biomart for cho ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) #Load the DE genes cgr_down_gene_ids<-readLines(con = "../Chapter2/output/DEGs/downregulated1.5GeneIds.txt") cgr_up_gene_ids<-readLines(con = "../Chapter2/output/DEGs/upregulated1.5GeneIds.txt") #Load the DE miRS up_known_mirna_ids<-read.table("output/multiMir/up_miRNAs.txt") down_known_mirna_ids<-read.table("output/multiMir/down_miRNAs.txt") #Annotate the up and down lists against human mouse and rat ids #retrieve human hsa_down_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_down_gene_ids, uniqueRows = F, mart = ensembl) hsa_down_homologs<-hsa_down_homologs[!duplicated(hsa_down_homologs$ensembl_gene_id),] #retrieve mouse, for those with no human id mmu_down_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','mmusculus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_down_homologs[hsa_down_homologs[,3] == "",1], uniqueRows = T, mart = ensembl) mmu_down_homologs<-mmu_down_homologs[!duplicated(mmu_down_homologs$ensembl_gene_id),] mmu_ortholog<-mmu_down_homologs[mmu_down_homologs[,3]!= "",c(1,3)] rownames(hsa_down_homologs)<-hsa_down_homologs[,1] #Loop through the table and append mouse ids for (i in 1:nrow(mmu_ortholog)){hsa_down_homologs[as.character(mmu_ortholog[i,1]),3]<-mmu_ortholog[i,2] } rno_down_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','rnorvegicus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_down_homologs[hsa_down_homologs[,3] == "",1], uniqueRows = F, mart = ensembl) rno_down_homologs<-rno_down_homologs[!duplicated(rno_down_homologs$ensembl_gene_id),] rno_ortholog<-rno_down_homologs[rno_down_homologs[,3]!= "",c(1,3)] rownames(hsa_down_homologs)<-hsa_down_homologs[,1] #Loop through the table and append rat ids for (i in 1:nrow(rno_ortholog)){hsa_down_homologs[as.character(rno_ortholog[i,1]),3]<-rno_ortholog[i,2] } #Repeat for up genes hsa_up_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_up_gene_ids, uniqueRows = F, mart = ensembl) hsa_up_homologs<-hsa_up_homologs[!duplicated(hsa_up_homologs$ensembl_gene_id),] #retrieve mouse, for those with no human id mmu_up_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','mmusculus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_up_homologs[hsa_up_homologs[,3] == "",1], uniqueRows = T, mart = ensembl) mmu_up_homologs<-mmu_up_homologs[!duplicated(mmu_up_homologs$ensembl_gene_id),] mmu_ortholog<-mmu_up_homologs[mmu_up_homologs[,3]!= "",c(1,3)] rownames(hsa_up_homologs)<-hsa_up_homologs[,1] #Loop through the table and append mouse ids for (i in 1:nrow(mmu_ortholog)){hsa_up_homologs[as.character(mmu_ortholog[i,1]),3]<-mmu_ortholog[i,2] } rno_up_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','rnorvegicus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_up_homologs[hsa_up_homologs[,3] == "",1], uniqueRows = F, mart = ensembl) rno_up_homologs<-rno_up_homologs[!duplicated(rno_up_homologs$ensembl_gene_id),] rno_ortholog<-rno_up_homologs[rno_up_homologs[,3]!= "",c(1,3)] rownames(hsa_up_homologs)<-hsa_up_homologs[,1] #Loop through the table and append rat ids for (i in 1:nrow(rno_ortholog)){hsa_up_homologs[as.character(rno_ortholog[i,1]),3]<-rno_ortholog[i,2] } #Run the multimir target prediction for up and down vs both up and down and retrieve all validated targets multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "validated", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirValidated.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirValidated.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirValidated.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirValidated.tsv") #Return all conserved sites multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "miranda", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirMiranda.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirMiranda.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirMiranda.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirMiranda.tsv") multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "pita", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirpita.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirpita.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirpita.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirpita.tsv") multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "targetscan", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirtargetscan.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirtargetscan.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirtargetscan.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirtargetscan.tsv") #Return top 20% of hits of predicted sites multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "predicted", summary = TRUE) multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "predicted", summary = TRUE ) multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "predicted", summary = TRUE) multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "predicted", summary = TRUE) write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirPredicted.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirPredicted.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirPredicted.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirPredicted.tsv") quit() y #Cat all the conserved results into one table sed -i 1d upMrnaUpMirpita.tsv sed -i 1d upMrnaDownMirpita.tsv sed -i 1d downMrnaUpMirpita.tsv sed -i 1d downMrnaDownMirpita.tsv sed -i 1d upMrnaUpMirtargetscan.tsv sed -i 1d upMrnaDownMirtargetscan.tsv sed -i 1d downMrnaUpMirtargetscan.tsv sed -i 1d downMrnaDownMirtargetscan.tsv cat upMrnaUpMirMiranda.tsv upMrnaUpMirpita.tsv upMrnaUpMirtargetscan.tsv > upMrnaUpMirConserved.tsv cat upMrnaDownMirMiranda.tsv upMrnaDownMirpita.tsv upMrnaDownMirtargetscan.tsv > upMrnaDownMirConserved.tsv cat downMrnaUpMirMiranda.tsv downMrnaUpMirpita.tsv downMrnaUpMirtargetscan.tsv > downMrnaUpMirConserved.tsv cat downMrnaDownMirMiranda.tsv downMrnaDownMirpita.tsv downMrnaDownMirtargetscan.tsv > downMrnaDownMirConserved.tsv rm *pita.tsv *targetscan.tsv *miranda.tsv ################################################################# #Filter targets for all validate and those in targetscan +1 mkdir targets #get the validated targets cut -f 6,8,10,14 -d '"' upMrnaDownMirValidated.tsv |sed 1d|sed 's/"/\t/g'|sort -k 2,2 |uniq > targets/upMrnaDownMirValidated.tsv cut -f 6,8,10,14 -d '"' downMrnaDownMirValidated.tsv |sed 1d|sed 's/"/\t/g'|sort -k 2,2 |uniq > targets/downMrnaDownMirValidated.tsv cut -f 6,8,10,14 -d '"' upMrnaUpMirValidated.tsv |sed 1d|sed 's/"/\t/g'|sort -k 2,2 |uniq > targets/upMrnaUpMirValidated.tsv cut -f 6,8,10,14 -d '"' downMrnaUpMirValidated.tsv |sed 1d|sed 's/"/\t/g'|sort -k 2,2 |uniq > targets/downMrnaUpMirValidated.tsv #get the targetscan targets cat upMrnaDownMirPredicted.tsv upMrnaDownMirConserved.tsv| grep -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > upMrnaDownMirTargetScan.tsv cat upMrnaUpMirPredicted.tsv upMrnaUpMirConserved.tsv| grep -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > upMrnaUpMirTargetScan.tsv cat downMrnaDownMirPredicted.tsv downMrnaDownMirConserved.tsv| grep -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > downMrnaDownMirTargetScan.tsv cat downMrnaUpMirPredicted.tsv downMrnaUpMirConserved.tsv| grep -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > downMrnaUpMirTargetScan.tsv #non targetscan cat upMrnaDownMirPredicted.tsv upMrnaDownMirConserved.tsv| grep -v -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > upMrnaDownMirNonTargetScan.tsv cat upMrnaUpMirPredicted.tsv upMrnaUpMirConserved.tsv| grep -v -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > upMrnaUpMirNonTargetScan.tsv cat downMrnaDownMirPredicted.tsv downMrnaDownMirConserved.tsv| grep -v -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > downMrnaDownMirNonTargetScan.tsv cat downMrnaUpMirPredicted.tsv downMrnaUpMirConserved.tsv| grep -v -i targetscan |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort| uniq > downMrnaUpMirNonTargetScan.tsv #overlap targetscan and nontargetscan comm -12 downMrnaDownMirTargetScan.tsv downMrnaDownMirNonTargetScan.tsv>targets/downMrnaDownMirTargetScanPlus1.tsv comm -12 upMrnaDownMirTargetScan.tsv upMrnaDownMirNonTargetScan.tsv>targets/upMrnaDownMirTargetScanPlus1.tsv comm -12 upMrnaUpMirTargetScan.tsv upMrnaUpMirNonTargetScan.tsv>targets/upMrnaUpMirTargetScanPlus1.tsv comm -12 downMrnaUpMirTargetScan.tsv downMrnaUpMirNonTargetScan.tsv>targets/downMrnaUpMirTargetScanPlus1.tsv cat downMrnaDownMirTargetScanPlus1.tsv downMrnaDownMirValidated.tsv |sort |uniq > downMrnaDownMir.tsv cat downMrnaUpMirTargetScanPlus1.tsv downMrnaUpMirValidated.tsv |sort |uniq > downMrnaUpMir.tsv cat upMrnaDownMirTargetScanPlus1.tsv upMrnaDownMirValidated.tsv |sort |uniq > upMrnaDownMir.tsv cat upMrnaUpMirTargetScanPlus1.tsv upMrnaUpMirValidated.tsv |sort|uniq > upMrnaUpMir.tsv ## Plotting #File plotData.txt created taking the numbers of Validated, Conserved and Predicted Mir targets (ensured no duplication of predicted and conserved) #Appendix 4F R library(ggplot2) library(reshape2) plotData<-read.table ("plotData.txt", header=TRUE) temp <- plotData[,-2:-3] melted <-melt(temp,id=c("hsa.miR","Genes")) colnames(melted)<-c("hsa.miR","Differentially_Expressed_Genes","Evidence","Targets") ggplot(melted, aes(x = Differentially_Expressed_Genes, y = Targets, fill = Evidence)) + geom_bar(stat = 'identity', position = 'stack') + facet_grid(~ hsa.miR)+ scale_fill_manual(values=c("springgreen3","chocolate1","tomato2")) #per database plot #Appendix 4G genes per database stacked<-read.table("targetDatabase.txt",header=TR melted<-melt(stacked,id=c("Database","Expression")) colnames(melted)<-c("Database","Expression","Evidence","Targets") ggplot(melted, aes(x = Expression, y = Targets, fill = Evidence)) + geom_bar(stat = 'identity', position = 'stack') + facet_grid(~ Database)+ scale_fill_manual(values=c("springgreen3","chocolate1","tomato2")) ################### Section 9 - Overlapping miR targets with proteomics results cd $output/multiMir cat downMrnaDownMirValidated.tsv downMrnaDownMirConserved.tsv downMrnaDownMirPredictedOnly.txt > downMrnaDownMir.tsv cat downMrnaUpMirValidated.tsv downMrnaUpMirConserved.tsv downMrnaUpMirPredictedOnly.txt > downMrnaUpMir.tsv cat upMrnaDownMirValidated.tsv upMrnaDownMirConserved.tsv upMrnaDownMirPredictedOnly.txt >upMrnaDownMir.tsv cat upMrnaUpMirValidated.tsv upMrnaUpMirConserved.tsv upMrnaUpMirPredictedOnly.txt >upMrnaUpMir.tsv cat upMrnaUpMir.tsv |grep ENSG |cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > upMrnaUpMirAllTargets.tsv cat upMrnaDownMir.tsv |grep ENSG |cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > upMrnaDownMirAllTargets.tsv cat downMrnaUpMir.tsv |grep ENSG |cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > downMrnaUpMirAllTargets.tsv cat downMrnaDownMir.tsv |grep ENSG |cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > downMrnaDownMirAllTargets.tsv cd $wd mkdir $output/proteomicsOverlap cat ../Chapter2/output/proteomicsOverlap/upGenesDetectedInButNotDeProteomics.txt ../Chapter2/output/proteomicsOverlap/upGeneDownProtein.txt |sort|uniq> $output/proteomicsOverlap/potentialUpMirTargets.txt cat ../Chapter2/output/proteomicsOverlap/downGenesDetectedInButNotDeProteomics.txt ../Chapter2/output/proteomicsOverlap/downGeneUpProtein.txt |sort|uniq > $output/proteomicsOverlap/potentialDownMirTargets.txt grep -f output/proteomicsOverlap/potentialUpMirTargets.txt CHO_Human_ENSG_key.txt |cut -f 3 |sort |uniq > output/proteomicsOverlap/potentialUpMirTargetsHuman.txt grep -f output/proteomicsOverlap/potentialDownMirTargets.txt CHO_Human_ENSG_key.txt |cut -f 3 |sort |uniq > output/proteomicsOverlap/potentialDownMirTargetsHuman.txt ############Can up miRNAs affecting up mRNAs explain any of the lack of correlation at the proteomics level? Yes. A huge amount of it. #137 interactions for down down for 51 genes. 39 interactions 18 genes grep -f output/proteomicsOverlap/potentialUpMirTargetsHuman.txt output/multiMir/upMrnaUpMirAllTargets.tsv >output/proteomicsOverlap/upMrnaUpMirNoDeProtein.txt grep -f output/proteomicsOverlap/potentialDownMirTargetsHuman.txt output/multiMir/downMrnaDownMirAllTargets.tsv > output/proteomicsOverlap/downMrnaDownMirNoDeProtein.txt #These genes have down gene but up protein and a down mir targetting them cat ../Chapter2/output/proteomicsOverlap/downGeneUpProtein.txt |while read line ; do grep $line CHO_Human_ENSG_key.txt;done|cut -f 3 |sort|uniq| while read line ; do grep $line output/proteomicsOverlap/downMrnaDownMirNoDeProtein.txt ;done > $output/proteomicsOverlap/downGeneUpProteinDownMir.txt ########### Can the up/down proteins which arent de at the gene level be explained by miRNAs? mkdir $output/proteomicsOverlap/multimirProteins cp ../Chapter2/output/proteomicsOverlap/upProteinNotGene.txt output/proteomicsOverlap/multimirProteins/upProteinNotGene.txt cp ../Chapter2/output/proteomicsOverlap/downProteinNotGene.txt output/proteomicsOverlap/multimirProteins/downProteinNotGene.txt cd $output/proteomicsOverlap/multimirProteins$ R library(multiMiR) library(biomaRt) #Retrieve ensembl IDs from biomart for cho ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) cgr_down_protein_ids<-readLines(con = "downProteinNotGene.txt") cgr_up_protein_ids<-readLines(con = "upProteinNotGene.txt") hsa_down_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_down_protein_ids, uniqueRows = F, mart = ensembl) hsa_up_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_up_protein_ids, uniqueRows = F, mart = ensembl) up_known_mirna_ids<-read.table("../../multiMir/up_miRNAs.txt") down_known_mirna_ids<-read.table("../../multiMir/down_miRNAs.txt") multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "validated", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirValidated.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirValidated.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirValidated.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirValidated.tsv") #Return all conserved sites multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "miranda", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirMiranda.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirMiranda.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirMiranda.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirMiranda.tsv") multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "pita", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirpita.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirpita.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirpita.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirpita.tsv") multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "targetscan", summary = TRUE , predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99,predicted.site="conserved") write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirtargetscan.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirtargetscan.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirtargetscan.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirtargetscan.tsv") #Return top 20% of hits of predicted sites multimir_results_31_down_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "predicted", summary = TRUE) multimir_results_31_up_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "predicted", summary = TRUE ) multimir_results_31_up_mrna_v_up_mir <- get_multimir(org= 'hsa', mirna = up_known_mirna_ids, target = hsa_up_homologs[hsa_up_homologs[,3] != "",3], table = "predicted", summary = TRUE) multimir_results_31_down_mrna_v_down_mir <- get_multimir(org= 'hsa', mirna = down_known_mirna_ids, target = hsa_down_homologs[hsa_down_homologs[,3] != "",3], table = "predicted", summary = TRUE) write.table(multimir_results_31_down_mrna_v_up_mir@data,"downMrnaUpMirPredicted.tsv") write.table(multimir_results_31_up_mrna_v_down_mir@data,"upMrnaDownMirPredicted.tsv") write.table(multimir_results_31_up_mrna_v_up_mir@data,"upMrnaUpMirPredicted.tsv") write.table(multimir_results_31_down_mrna_v_down_mir@data,"downMrnaDownMirPredicted.tsv") quit() y cat downMrnaDownMirMiranda.tsv downMrnaDownMirPredicted.tsv downMrnaDownMirValidated.tsv downMrnaDownMirpita.tsv downMrnaDownMirtargetscan.tsv |grep ENSG| cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq >downProteinNotGeneDownMir.txt cat downMrnaUpMirMiranda.tsv downMrnaUpMirPredicted.tsv downMrnaUpMirValidated.tsv downMrnaUpMirpita.tsv downMrnaUpMirtargetscan.tsv |grep ENSG| cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > downProteinNotGeneUpMir.txt cat upMrnaDownMirMiranda.tsv upMrnaDownMirPredicted.tsv upMrnaDownMirValidated.tsv upMrnaDownMirpita.tsv upMrnaDownMirtargetscan.tsv |grep ENSG| cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq > upProteinNotGeneDownMir.txt cat upMrnaUpMirMiranda.tsv upMrnaUpMirPredicted.tsv upMrnaUpMirValidated.tsv upMrnaUpMirpita.tsv upMrnaUpMirtargetscan.tsv |grep ENSG| cut -f 6,14 -d '"' |sed 's/"/\t/g'|sort |uniq >upProteinNotGeneUpMir.txt mkdir targets mv upProteinNotGeneDownMir.txt targets mv downProteinNotGeneUpMir.txt targets ##92/119 of the up proteins with no down gene and 74/99 down proteins have targets #### plots FC<-cbind(c("Eps8","Hat1","Lmnb2","H2afx","Eps8","Hat1","Lmnb2","H2afx"),c("Gene","Gene","Gene","Gene","Protein","Protein","Protein","Protein"),c(-1.89,-1.79,-1.69,-2.64,1.71,1.27,1.27,3.48)) colnames(FC)<-c("Gene","Experiment","FoldChange") FC<-as.data.frame(FC) FC$FoldChange=as.numeric(levels(FC$FoldChange))[FC$FoldChange] ggplot(FC,aes(Gene, FoldChange, fill=Experiment))+ geom_bar(stat="identity", position="dodge")+ scale_fill_brewer(palette="Set1") # mirProtein<-read.table("mirsprotein.txt",header=TRUE) ggplot(mirProtein, aes(x=miRNA, y=Genes,fill=miR.DE))+ geom_bar(stat="identity") ##################### #Section X- mirTarget prediction on de spliced, de transated and de proteins ##################### #Get a list of all the spliced genes, differentially translated genes, and proteomics genes cd ~/ThesisMaster/Chapter4/output/multiMir mkdir proteins translation splicing cat ~/ThesisMaster/Chapter6/downregulatedProteins.tsv ~/ThesisMaster/Chapter6/upregulatedProteins.tsv |cut -f 1 -d ' '|sort |uniq > proteins/deProteins.txt cat ~/ThesisMaster/Chapter6/splicing/allAltSplicedTranscripts.txt| sort |uniq > splicing/deSplicing.txt cat ~/ThesisMaster/Chapter5/output/overlapTE/14 > deTranslation.txt ############## Splicing cd splicing cat ../up_miRNAs.txt ../down_miRNAs.txt > deMirs.txt R library(biomaRt) library(multiMiR) ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) cgr_spliced_genes<-readLines(con = "deSplicing.txt") deMirs<-readLines(con="deMirs.txt") #Retrieve human mouse and rat annotations hsa_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_spliced_genes, uniqueRows = F, mart = ensembl) hsa_homologs<-hsa_homologs[!duplicated(hsa_homologs$ensembl_gene_id),] #retrieve mouse, for those with no human id mmu_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','mmusculus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = T, mart = ensembl) mmu_homologs<-mmu_homologs[!duplicated(mmu_homologs$ensembl_gene_id),] mmu_ortholog<-mmu_homologs[mmu_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append mouse ids for (i in 1:nrow(mmu_ortholog)){hsa_homologs[as.character(mmu_ortholog[i,1]),3]<-mmu_ortholog[i,2] } rno_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','rnorvegicus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = F, mart = ensembl) rno_homologs<-rno_homologs[!duplicated(rno_homologs$ensembl_gene_id),] rno_ortholog<-rno_homologs[rno_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append rat ids for (i in 1:nrow(rno_ortholog)){hsa_homologs[as.character(rno_ortholog[i,1]),3]<-rno_ortholog[i,2] } #Run multimir to get all validated targets, conserved and 20% of top predicted validatedMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) mirandaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) pitaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) targetscanMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) predictedMirs <- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "predicted", summary = TRUE) write.table(validatedMirs@data,"splicedTargetsValidated.tsv") write.table(mirandaMirs@data,"splicedTargetsMiranda.tsv") write.table(pitaMirs@data,"splicedTargetsPITA.tsv") write.table(targetscanMirs@data,"splicedTargetsTargetscan.tsv") write.table(predictedMirs@data,"splicedTargetsPredicted.tsv") quit() y grep targetscan splicedTargetsPredicted.tsv |cat - splicedTargetsTargetscan.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort|uniq>targetScan.tmp grep -v targetscan splicedTargetsPredicted.tsv |cat - splicedTargetsPITA.tsv splicedTargetsMiranda.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort -r |uniq|sed 1d |sort > nontargetscan.tmp comm -12 nontargetscan.tmp targetScan.tmp >targetPlus1.tmp cut -f 6,8,10,14 -d '"' splicedTargetsValidated.tsv |sed 's/"/\t/g' |sed 1d |cat - targetPlus1.tmp |sort |uniq >splicedTargetsDeMirs.txt cd .. ############## proteomics cd proteins R library(biomaRt) library(multiMiR) ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) cgr_proteins_genes<-readLines(con = "deProteins.txt") deMirs<-readLines(con="../splicing/deMirs.txt") #Retrieve human mouse and rat annotations hsa_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_proteins_genes, uniqueRows = F, mart = ensembl) hsa_homologs<-hsa_homologs[!duplicated(hsa_homologs$ensembl_gene_id),] #retrieve mouse, for those with no human id mmu_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','mmusculus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = T, mart = ensembl) mmu_homologs<-mmu_homologs[!duplicated(mmu_homologs$ensembl_gene_id),] mmu_ortholog<-mmu_homologs[mmu_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append mouse ids for (i in 1:nrow(mmu_ortholog)){hsa_homologs[as.character(mmu_ortholog[i,1]),3]<-mmu_ortholog[i,2] } rno_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','rnorvegicus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = F, mart = ensembl) rno_homologs<-rno_homologs[!duplicated(rno_homologs$ensembl_gene_id),] rno_ortholog<-rno_homologs[rno_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append rat ids for (i in 1:nrow(rno_ortholog)){hsa_homologs[as.character(rno_ortholog[i,1]),3]<-rno_ortholog[i,2] } #Run multimir to get all validated targets, conserved and 20% of top predicted validatedMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) mirandaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) pitaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) targetscanMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) predictedMirs <- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "predicted", summary = TRUE) write.table(validatedMirs@data,"proteinsTargetsValidated.tsv") write.table(mirandaMirs@data,"proteinsTargetsMiranda.tsv") write.table(pitaMirs@data,"proteinsTargetsPITA.tsv") write.table(targetscanMirs@data,"proteinsTargetsTargetscan.tsv") write.table(predictedMirs@data,"proteinsTargetsPredicted.tsv") quit() y grep targetscan proteinsTargetsPredicted.tsv |cat - proteinsTargetsTargetscan.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort|uniq>targetScan.tmp grep -v targetscan proteinsTargetsPredicted.tsv |cat - proteinsTargetsPITA.tsv proteinsTargetsMiranda.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort -r |uniq|sed 1d |sort > nontargetscan.tmp comm -12 nontargetscan.tmp targetScan.tmp >targetPlus1.tmp cut -f 6,8,10,14 -d '"' proteinsTargetsValidated.tsv |sed 's/"/\t/g' |sed 1d |cat - targetPlus1.tmp |sort |uniq >proteinsTargetsDeMirs.txt cd .. ############## translation cd translation R library(biomaRt) library(multiMiR) ensembl=useMart("ensembl") ensembl = useDataset("ccrigri_gene_ensembl",mart=ensembl) filters = listFilters(ensembl) attributes=listAttributes(ensembl) cgr_translation_genes<-readLines(con = "deTranslation.txt") deMirs<-readLines(con="../splicing/deMirs.txt") #Retrieve human mouse and rat annotations hsa_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','hsapiens_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = cgr_translation_genes, uniqueRows = F, mart = ensembl) hsa_homologs<-hsa_homologs[!duplicated(hsa_homologs$ensembl_gene_id),] #retrieve mouse, for those with no human id mmu_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','mmusculus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = T, mart = ensembl) mmu_homologs<-mmu_homologs[!duplicated(mmu_homologs$ensembl_gene_id),] mmu_ortholog<-mmu_homologs[mmu_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append mouse ids for (i in 1:nrow(mmu_ortholog)){hsa_homologs[as.character(mmu_ortholog[i,1]),3]<-mmu_ortholog[i,2] } rno_homologs<-getBM(attributes=c('ensembl_gene_id','external_gene_name','rnorvegicus_homolog_ensembl_gene'), filters = 'ensembl_gene_id', values = hsa_homologs[hsa_homologs[,3] == "",1], uniqueRows = F, mart = ensembl) rno_homologs<-rno_homologs[!duplicated(rno_homologs$ensembl_gene_id),] rno_ortholog<-rno_homologs[rno_homologs[,3]!= "",c(1,3)] rownames(hsa_homologs)<-hsa_homologs[,1] #Loop through the table and append rat ids for (i in 1:nrow(rno_ortholog)){hsa_homologs[as.character(rno_ortholog[i,1]),3]<-rno_ortholog[i,2] } #Run multimir to get all validated targets, conserved and 20% of top predicted validatedMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "validated", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) mirandaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "miranda", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) pitaMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "pita", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) targetscanMirs<- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "targetscan", summary = TRUE, predicted.cutoff.type="p", predicted.cutoff=99) predictedMirs <- get_multimir(org= 'hsa', mirna = deMirs, target = hsa_homologs[hsa_homologs[,3] != "",3], table = "predicted", summary = TRUE) write.table(validatedMirs@data,"translationTargetsValidated.tsv") write.table(mirandaMirs@data,"translationTargetsMiranda.tsv") write.table(pitaMirs@data,"translationTargetsPITA.tsv") write.table(targetscanMirs@data,"translationTargetsTargetscan.tsv") write.table(predictedMirs@data,"translationTargetsPredicted.tsv") quit() y grep targetscan translationTargetsPredicted.tsv |cat - translationTargetsTargetscan.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort|uniq>targetScan.tmp grep -v targetscan translationTargetsPredicted.tsv |cat - translationTargetsPITA.tsv translationTargetsMiranda.tsv |cut -f 6,8,10,14 -d '"' |sed 's/"/\t/g' |sort -r |uniq|sed 1d |sort > nontargetscan.tmp comm -12 nontargetscan.tmp targetScan.tmp >targetPlus1.tmp cut -f 6,8,10,14 -d '"' translationTargetsValidated.tsv |sed 's/"/\t/g' |sed 1d |cat - targetPlus1.tmp |sort |uniq >translationTargetsDeMirs.txt cd .. ################################ # Thesis Chapter 5 ################################ cd ~/ThesisMaster mkdir Chapter5 && cd Chapter5 mkdir input output preprocessed genome export wd=~/ThesisMaster/Chapter5 export output=$wd/output export genome=~/ThesisMaster/Chapter2/genome export ribogenome=$wd/genome export input=$wd/input export CPU=70 ########################## Section 1 - Preparing Inputs and Genomes #Copy the raw data from backup cp /mnt/synology/Shared/raw_data/sirg/RiboSeq/rpf/rpf-rna-* $input/ cp /mnt/synology/Shared/raw_data/sirg/RiboSeq/controlrna/control-rna-* $input/ ls input/rp*sequence.txt.gz |cut -f 2 -d '/' |cut -f 1 -d '_' >samplefileRibo.txt ls input/co*sequence.txt.gz |cut -f 2 -d '/' |cut -f 1 -d '_' >samplefileRNA.txt export ribo=$wd/samplefileRibo.txt export rna=$wd/samplefileRNA.txt gunzip $input/* #Downloaded fasta sequence of CriGri rRNA and saved in the file $input/rRNA.fa bowtie-build $input/rRNA.fa $input/rRNA.fa #Generate STAR index for long control and short ribo reads #ribo mkdir $ribogenome/ribo $ribogenome/rna STAR --runThreadN $CPU --runMode genomeGenerate --genomeDir $ribogenome/ribo --genomeFastaFiles $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa --sjdbGTFfile $wd/../Chapter3/output/stringtie/merged.gtf --sjdbOverhang 33 --limitGenomeGenerateRAM 200000000000 #rna STAR --runThreadN $CPU --runMode genomeGenerate --genomeDir $ribogenome/rna --genomeFastaFiles $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa --sjdbGTFfile $wd/../Chapter3/output/stringtie/merged.gtf --sjdbOverhang 66 --limitGenomeGenerateRAM 200000000000 #Take protein coding genes from the gtf grep 'transcript_biotype "protein_coding"' /home/craig/ThesisMaster/Chapter2/genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf > $ribogenome/protein.gtf ########################## Section 2 - QC #initial QC fastqc -t CPU $input/* #Length filter ribo cat $ribo | while read sample ; do awk 'BEGIN {OFS = "\n"} {header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= 28 && length(seq) <= 34) {print header, seq, qheader, qseq}}' < $input/“$sample”_sequence.txt > $preprocessed/“$sample”.lengthFiltered.fq ; done #length filter RNA cat $rna.txt | while read sample ; do awk 'BEGIN {OFS = "\n"} {header = $0 ; getline seq ; getline qheader ; getline qseq ; if (length(seq) >= 28) {print header, seq, qheader, qseq}}' < $input/"$sample"_sequence.txt > $preprocessed/"$sample".lengthFiltered.fq ; done #rRNA filter ribo cat $ribo |while read file; do bowtie -p $CPU -l 23 --un ../preprocessed/"$file".fq $input/rRNA.fa $preprocessed/"$file".lengthFiltered.fq ; done > tmp #rRNA filter cat $rna |while read file; do bowtie -p $CPU -l 23 --un ../preprocessed/"$file".fq $input/rRNA.fa $preprocessed/"$file".lengthFiltered.fq ; done > tmp fastqc -t CPU $preprocessed/* ######################### Section 3 - Genome alignment mkdir $output/riboAlign $output/rnaAlign #ribo alignment. Use the winanchormultimapnmax=200 and seedsearchstartlmax=15 to adjust alignment for short reads STAR --runThreadN 70 --genomeLoad LoadAndExit --genomeDir $ribogenome/ribo/ cat $ribo |while read file ; do mkdir $output/riboAlign/"$file" ; STAR --genomeLoad LoadAndKeep --runThreadN $CPU --genomeDir $ribogenome/ribo/ --readFilesIn $wd/preprocessed/"$file".fq --outFileNamePrefix $output/riboAlign/"$file"/ --winAnchorMultimapNmax 200 --seedSearchStartLmax 15 & done STAR --genomeLoad Remove --genomeDir $ribogenome/ribo #rna alignment STAR --runThreadN 70 --genomeLoad LoadAndExit --genomeDir $ribogenome/rna/ cat $rna |while read file ; do mkdir $output/rnaAlign/"$file" ; STAR--genomeLoad LoadAndKeep --runThreadN $CPU --genomeDir $ribogenome/rna/ --readFilesIn $wd/preprocessed/"$file".fq --outFileNamePrefix $output/rnaAlign/"$file"/ & done STAR --genomeLoad Remove --genomeDir $ribogenome/rna #sam to bam and sort. Remove sams less $ribo |while read line ; do samtools view -b -@ 70 $output/riboAlign/$line/Aligned.out.sam |samtools sort > $output/riboAlign/$line.bam;done less $rna |while read line ; do samtools view -b -@ 70 $output/rnaAlign/$line/Aligned.out.sam |samtools sort > $output/rnaAlign/$line.bam;done less $ribo | while read line ; do rm $output/riboAlign/$line -r ; done less $rna | while read line ; do rm $output/rnaAlign/$line -r ; done ########################## Section 4 - RiboTaper mkdir $output/Ribotaper Ribotaper_create_annotations_files.bash ../Chapter2/genome/Cricetulus_griseus_crigri.CriGri_1.0.93.gtf $genome/Cricetulus_griseus_crigri.CriGri_1.0.dna.toplevel.fa false false annotation cat $ribo |while read file ; do Ribotaper_create_metaplots.bash $output/riboAlign/"$file".bam $wd/annotation/start_stops_FAR.bed $output/Ribotaper/"$file"_pSites ; done for i in {1..8}; do cd $output/RiboTaper; mkdir $i ;cd $i ;Ribotaper.sh $output/riboAlign/rpf-rna-"$i".bam $output/rnaAlign/control-rna-"$i".bam $wd/annotation/ 28,29,30,31 -12,-12,-12,-12 80 & done ######################### Section 5 - Quantification cd $wd mkdir $output/counts #filter for primary hits as STAR randomly assigned multimappers as the primary alignment cat $ribo |while read line ; do samtools view -b -F 260 $output/riboAlign/"$line".bam > $output/riboAlign/"$line".primary.bam & done cat $rna |while read line ; do samtools view -b -F 260 $output/rnaAlign/"$line".bam > $output/rnaAlign/"$line".primary.bam & done #count ribo featureCounts -g gene_id --primary -a $ribogenome/protein.gtf -o $output/counts/rpfCountsPrimary.tsv -T 64 $output/riboAlign/rpf-rna-1.primary.bam $output/riboAlign/rpf-rna-2.primary.bam $output/riboAlign/rpf-rna-3.primary.bam $output/riboAlign/rpf-rna-4.primary.bam $output/riboAlign/rpf-rna-5.primary.bam $output/riboAlign/rpf-rna-6.primary.bam $output/riboAlign/rpf-rna-7.primary.bam $output/riboAlign/rpf-rna-8.primary.bam 2> $output/counts/rpfCountsLog.txt #count rna featureCounts -g gene_id --primary -a $ribogenome/protein.gtf -o $output/counts/rnaCountsPrimary.tsv -T 64 $output/rnaAlign/control-rna-1.primary.bam $output/rnaAlign/control-rna-2.primary.bam $output/rnaAlign/control-rna-3.primary.bam $output/rnaAlign/control-rna-4.primary.bam $output/rnaAlign/control-rna-5.primary.bam $output/rnaAlign/control-rna-6.primary.bam $output/rnaAlign/control-rna-7.primary.bam $output/rnaAlign/control-rna-8.primary.bam 2> $output/counts/rnaCountsLog.txt #Reformat into R style matrix awk '{print ($1,$7,$8,$9,$10,$11,$12,$13,$14) }' $output/counts/rpfCountsPrimary.tsv |sed 1d | tr ' ' '\t'> $output/counts/rpf_counts.tsv sed -i 's/Geneid//g' $output/counts/rpf_counts.tsv sed -i 's/.primary.bam//g' $output/counts/rpf_counts.tsv sed -i 's/\/home\/craig\/ThesisMaster\/Chapter5\/output\/riboAlign\///g' $output/counts/rpf_counts.tsv awk '{print ($1,$7,$8,$9,$10,$11,$12,$13,$14) }' $output/counts/rnaCountsPrimary.tsv |sed 1d | tr ' ' '\t'> $output/counts/control_counts.tsv sed -i 's/Geneid//g' $output/counts/control_counts.tsv sed -i 's/.primary.bam//g' $output/counts/control_counts.tsv sed -i 's/\/home\/craig\/ThesisMaster\/Chapter5\/output\/rnaAlign\///g' $output/counts/control_counts.tsv rm $output/counts/rnaCountsPrimary.tsv $output/counts/rpfCountsPrimary.tsv $output/counts/rnaCountsPrimary.tsv.summary $output/counts/rpfCountsPrimary.tsv.summary ######################## Section 6 - Differential Translation mkdir $output/DT $output/DT/deseq $output/DT/xtail ###xtail library(xtail) test.rna <- read.table("output/counts/control_counts.tsv") test.rpf <- read.table("output/counts/rpf_counts.tsv") countData<-cbind (test.rpf,test.rna) counts<-countData cpmRibo<-min(colSums(countData[,1:8]))/1000000 cpmRNA<-min(colSums(countData[,9:16]))/1000000 counts<-counts[((rowSums(counts[,1:8])/8)> cpmRibo),] counts<-counts[((rowSums(counts[,9:16])/8)> cpmRNA),] condition<-c("NTS", "NTS", "NTS", "NTS","TS", "TS", "TS", "TS") test.results<-xtail(test.rna,test.rpf,condition) summary(test.results, alpha=0.05) write.xtail(test.results, "output/DT/xtail/test_results.txt",quote=F, sep="\t", log2Rs=TRUE, log2FCs=FALSE) quit() y awk '{if ($8 >= 0.58496250072 )print $0}' output/DT/xtail/test_results.txt|awk '{if ($10 <= 0.05)print $0}' > $output/DT/xtail/xtailUpProtein1.5.tsv awk '{if ($8 <= -0.58496250072 )print $0}' output/DT/xtail/test_results.txt|awk '{if ($10 <= 0.05)print $0}' >$output/DT/xtail/xtailDownProtein1.5.tsv ###DESeq2 #format input for deseq2 paste $output/counts/rpf_counts.tsv $output/counts/control_counts.tsv |cut -f -9,11-18 >$output/counts/allCounts.txt R library(DESeq2) library(edgeR) countData<-read.table("output/counts/allCounts.txt")#1cpm riboseq counts<-countData cpmRibo<-min(colSums(countData[,1:8]))/1000000 cpmRNA<-min(colSums(countData[,9:16]))/1000000 counts<-counts[((rowSums(counts[,1:8])/8)> cpmRibo),] counts<-counts[((rowSums(counts[,9:16])/8)> cpmRNA),] nrow (counts) #=9795 #Create experimental design table ED<-data.frame(seq=factor(c(rep("ribo",8),rep("rna",8))),condition=factor(c(rep("NTS",4),rep("TS",4))), row.names=colnames(counts)) #Create Deseq dataset with experimental design and apply linear model counts<-na.omit(counts) dds<-DESeqDataSetFromMatrix(countData=counts, colData=ED, design = ~ seq + condition + seq:condition) dds$seq<-relevel(dds$seq,ref="rna") dds<-DESeq(dds) res<-results(dds, lfcThreshold=0) res<-na.omit(res) sig<-res[res[,6]<=0.05,] sig<-sig[abs(sig[,2])>(log2(1.5)),] sig <- (sig[order (sig$padj),]) write.table(sig[sig[,2]>0,], file="output/counts/deseq/upTE1.5DESeq2.tsv", sep="\t" ) write.table(sig[sig[,2]< -0,], file="output/counts/deseq/downTE1.5DESeq2.tsv", sep="\t" ) write.table(res, "output/counts/deseq/deseqResults.txt") quit() y #DESeq2 Vs Xtail TE plot cut -f 2 -d '"' $output/DT/deseq/deseqResults.txt |while read line ; do grep -w $line $output/DT/xtail/test_results.txt ; done |cut -f 1 |while read gene ; do grep -w $gene $output/DT/deseq/deseqResults.txt ;done >$output/overlapTE/inBothDeseq.txt cut -f 2 -d '"' $output/DT/deseq/deseqResults.txt |while read line ; do grep -w $line $output/DT/xtail/test_results.txt ; done >$output/overlapTE/inBothXtail.txt $sort data for Xtail vs DESeq FC plot sort inBothXtail.txt | cut -f 1,8 >xtail.tmp sort inBothDeseq.txt | cut -f 1,3 -d ' ' >deseq.tmp paste deseq.tmp xtail.tmp |sed 's/"//g' |sed 's/ /\t/g'|cut -f 1,2,4 >deseqXtailOverlap.txt #added colnames deseq and xtail comm -23 1 4 >deseq comm -13 1 4 >xtail cp 14 ./overlap #Get data for MA cut -f 1,3,7 ../DT/deseq/deseqResults.txt -d ' ' >DESeqMA.tsv ####################### #Plots #foldchange plot R foldchanges<-read.table("deseqXtailOverlap.txt") deseqonly<-as.list(read.table("deseq")) xtailonly<-as.list(read.table("xtail")) bothxd<-as.list(read.table("overlap")) plot(foldchanges,pch=".",cex=2,ylab="xtail TE fold change", xlab="DESeq2 TE fold change") points(na.omit(foldchanges[as.character(bothxd$V1),]),col="orange",pch=16,cex=0.5) points(na.omit(foldchanges[as.character(xtailonly$V1),]),col="firebrick1",pch=16,cex=0.7) points(na.omit(foldchanges[as.character(deseqonly$V1),]),col="blue",pch=16,cex=0.7) legend (x="topleft", legend=c("DESeq","xTail","Both"), col=c("blue","firebrick1","orange"), pch=20) MAdata<-read.table("DESeqMA.tsv") #volcano plot plot (MAdata[,1],-log10(MAdata[,2]),pch=".",cex=2,ylim=c(0,20),xlab="log2FC TE", ylab="-log10 BH adj. P value") points(MAdata[as.character(bothxd$V1),1],-log10(MAdata[as.character(bothxd$V1),2]), col="red",cex=0.7,pch=16) points(MAdata[as.character(xtailonly$V1),1],-log10(MAdata[as.character(xtailonly$V1),2]), col="orange",cex=0.7,pch=16) points(MAdata[as.character(deseqonly$V1),1],-log10(MAdata[as.character(deseqonly$V1),2]), col="blue",cex=0.7,pch=16) legend (x="topleft", legend=c("DESeq","xTail","Both"), col=c("blue","orange","firebrick1"), pch=20) riboFCsigUp=c() rnaFCsigUp=c() riboFCsigDown=c() rnaFCsigDown=c() for (i in 1:length(c(upIDs))) {riboFCsigUp[i]<-(rowMeans(counts[c(upIDs)[i],1:4])/rowMeans(counts[c(upIDs)[i],5:8]))} for (i in 1:length(c(upIDs))) {rnaFCsigUp[i]<-(rowMeans(counts[c(upIDs)[i],9:12])/rowMeans(counts[c(upIDs)[i],13:16]))} for (i in 1:length(c(downIDs))) {riboFCsigDown[i]<-(rowMeans(counts[c(downIDs)[i],1:4])/rowMeans(counts[c(downIDs)[i],5:8]))} for (i in 1:length(c(downIDs))) {rnaFCsigDown[i]<-(rowMeans(counts[c(downIDs)[i],9:12])/rowMeans(counts[c(downIDs)[i],13:16]))}