#!/bin/bash -l

# Specify the project (replace 'scv' with your own project name):
#$ -P scv

# Give the job a name (use a specific name related to the main purpose of the task):
#$ -N gatk_SRR062634_200

# Send an email when the job is finished (or aborted):
#$ -m ae

# Join the error and output files:
#$ -j y

# Specify the qlog directory:
#$ -o qlog

# Request multiple cores:
#$ -pe omp 2

# Request memory:
#$ -l mem_per_core=4g

# Keep track of key job information:
echo "=========================================================="
echo "Starting on       : $(date)"
echo "Running on node   : $(hostname)"
echo "Current directory : $(pwd)"
echo "Current job ID    : $JOB_ID"
echo "Current job name  : $JOB_NAME"
echo "Number of cores   : $NSLOTS"
echo "=========================================================="

# Define variables:
# Get the current working directory; we assume this is where all the example data
# and scripts are located. You can also use an absolute path if you prefer.
WORK_DIR=`pwd`
DATA_DIR=${WORK_DIR}/data
REF_DIR=${WORK_DIR}/ref
OUT_DIR=${WORK_DIR}/out
QSUB_DIR=${WORK_DIR}/qsub

# Set the GATK tmp directory:
GATK_TMPDIR=${WORK_DIR}/tmp/
mkdir -p $GATK_TMPDIR

# Load all the modules needed.
# When loading modules, please specify the version of each software module;
# this helps with reproducibility and record-keeping.
module load sratoolkit/2.10.5   # used for downloading sample data from NCBI
module load bwa/0.7.17          # used for alignment
module load picard/2.25.2       # used for pre- and post-processing of sequence/reference data
module load gatk/4.2.0.0        # used for variant calling

# Use the pre-built reference from the GATK resource bundle by linking the ref
# directory to the existing copy in the shared GATK resource area:
cd ${REF_DIR}
# Link to the central repository of reference data if not yet done:
if [ ! -L Homo_sapiens_assembly38.fasta ]; then
    ln -s /share/pkg.7/gatk/resource/v0/* .
fi

# Set reference files:
REF=${REF_DIR}/Homo_sapiens_assembly38.fasta
DICT=${REF_DIR}/Homo_sapiens_assembly38.dict
SNP=${REF_DIR}/Homo_sapiens_assembly38.dbsnp138.vcf
INDEL=${REF_DIR}/Homo_sapiens_assembly38.known_indels.vcf.gz

# Use NCBI's sratoolkit to get the sample data:
cd $DATA_DIR
SAMPLE_ACC=SRR062634   # accession number of the sequence data of interest
MAX_SPOT=200           # only fetch the first 200 spots to keep the example small
SAMPLE_BASENAME=${SAMPLE_ACC}_X${MAX_SPOT}   # incorporate the accession in names
SAMPLE_DATADIR=${DATA_DIR}/$SAMPLE_BASENAME
mkdir -p $SAMPLE_DATADIR

# Output directory:
# Incorporate the job name and job ID in the result folder, to help track and
# match the code with its results.
SAMPLE_OUTDIR=$OUT_DIR/${JOB_NAME}_${JOB_ID}
mkdir -p $SAMPLE_OUTDIR
SAMPLEBASE=$SAMPLE_OUTDIR/$SAMPLE_ACC

# STEP 0:
# Download the first 200 spots of SRR062634.
# NOTE: there is a bug in sratoolkit; refer to https://github.com/ncbi/sra-tools/issues/381,
# copied below:
##################
# This is not a minor typo. You have switched the option from --split-3 to --split-e
# and so all processing breaks. However, the documentation still mentions --split-3.
# Please clarify whether moving forward the option will be --split-3 or --split-e?
# #336 (comment)
# split-3 is the correct spelling of the option.
#################
# For now, use '--split-e' in place of the intended '--split-3'.
cd $SAMPLE_DATADIR
CMD="time fastq-dump -X $MAX_SPOT --split-e ${SAMPLE_ACC}"
echo $CMD
eval $CMD

# Define R1, R2:
R1=${SAMPLE_DATADIR}/${SAMPLE_ACC}_1.fastq
R2=${SAMPLE_DATADIR}/${SAMPLE_ACC}_2.fastq
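
# Optional sanity check (a minimal addition, not part of the original example):
# SRR062634 is a paired-end run, so fastq-dump in split mode should produce both
# mate files. Fail fast here if either is missing or empty, rather than letting
# bwa fail later with a less obvious error.
if [ ! -s $R1 ] || [ ! -s $R2 ]; then
    echo "ERROR: expected paired FASTQ files $R1 and $R2 were not created" >&2
    exit 1
fi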

# Set the read group:
# Embed the sample name in all the header info fields.
sampleRgID=${SAMPLE_ACC}-LANE001   ### This needs to be a UNIQUE label!!
sampleRgSM=${SAMPLE_ACC}           ## Sample. This will be used to name the output vcf
sampleRgLB=${SAMPLE_ACC}           ## Library
sampleRgPL=illumina                ## Platform/technology
sampleRgPU=${SAMPLE_ACC}

# DONE with preparing the sequence data.

# Put the sample through the GATK pipeline: (R1,R2) -> recalibrated GATK bam file.

# STEP 1:
# Alignment to the reference sequence using bwa.
# Align (R1,R2) to the reference sequence -> sam file.
# -t: number of threads; pass $NSLOTS, which holds the total number of cores requested for the job.
# -a: output all found alignments for single-end or unpaired paired-end reads; these alignments are flagged as secondary.
# -M: mark shorter split hits as secondary (for Picard compatibility).
# Use 'man bwa' for more information.
BWA_SAM_OUT=${SAMPLEBASE}-bwa.sam
CMD="time bwa mem \
    -a -M \
    -t $NSLOTS \
    $REF \
    $R1 $R2 \
    > $BWA_SAM_OUT 2> ${SAMPLEBASE}-bwa.err"
echo $CMD
eval $CMD

# STEP 2:
# Add read groups, sort the reads by coordinate, save as a bam file and index the bam file.
# Note: the picard command line format changed in version 2.21.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR AddOrReplaceReadGroups \
    -I $BWA_SAM_OUT \
    -O ${SAMPLEBASE}-sort-rg.bam \
    --SORT_ORDER coordinate \
    --CREATE_INDEX true \
    --RGID $sampleRgID --RGSM $sampleRgSM --RGLB $sampleRgLB --RGPL $sampleRgPL --RGPU $sampleRgPU \
    > ${SAMPLEBASE}-rgsort.out 2> ${SAMPLEBASE}-rgsort.err"
echo $CMD
eval $CMD

# STEP 3:
# Mark duplicate reads.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR MarkDuplicates \
    -I ${SAMPLEBASE}-sort-rg.bam \
    -O ${SAMPLEBASE}-sort-rg-dedup.bam \
    -M ${SAMPLEBASE}-dedup-metric.txt \
    --ASSUME_SORT_ORDER coordinate \
    --CREATE_INDEX true \
    > ${SAMPLEBASE}-dedup.out 2> ${SAMPLEBASE}-dedup.err"
echo $CMD
eval $CMD

# STEP 4:
# Indel realignment (create targets).
# Note: the RealignerTargetCreator and IndelRealigner tools are no longer supported
# by GATK4; switch to GATK3 to complete STEP 4 and STEP 5.
module switch gatk/3.8-1
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \
    -T RealignerTargetCreator \
    -R $REF \
    -I ${SAMPLEBASE}-sort-rg-dedup.bam \
    -known $INDEL \
    -o ${SAMPLEBASE}-realign-targets.intervals \
    > ${SAMPLEBASE}-realigntarget.out 2> ${SAMPLEBASE}-realigntarget.err"
echo $CMD
eval $CMD

# STEP 5:
# Indel realignment (perform local realignment).
# Again, IndelRealigner is no longer supported by GATK4, so GATK3 is used here.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $SCC_GATK_DIR/install/GenomeAnalysisTK.jar \
    -T IndelRealigner \
    -R $REF \
    -known $INDEL \
    -targetIntervals ${SAMPLEBASE}-realign-targets.intervals \
    -I ${SAMPLEBASE}-sort-rg-dedup.bam \
    -o ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    --filter_bases_not_stored \
    > ${SAMPLEBASE}-realignbam.out 2> ${SAMPLEBASE}-realignbam.err"
echo $CMD
eval $CMD

# Switch back to gatk/4.2.0.0:
module load gatk/4.2.0.0

# STEP 6:
# Create a bam file index for the realigned bam.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR BuildBamIndex \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    > ${SAMPLEBASE}-realignbamindex.out 2> ${SAMPLEBASE}-realignbamindex.err"
echo $CMD
eval $CMD
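
# Optional sanity check (an addition to the original pipeline, shown as a minimal
# sketch): confirm that indel realignment actually produced a BAM before spending
# time on BQSR, so a failure in STEP 5 surfaces here with a clear message.
if [ ! -s ${SAMPLEBASE}-sort-rg-dedup-realigned.bam ]; then
    echo "ERROR: realigned BAM not found or empty; check ${SAMPLEBASE}-realignbam.err" >&2
    exit 1
fi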

# STEP 7:
# Base Quality Score Recalibration (create the recalibration table).
# This step detects systematic errors in base quality scores.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR BaseRecalibrator \
    -R $REF \
    -known-sites $SNP \
    -known-sites $INDEL \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    -O ${SAMPLEBASE}-recal-table.txt \
    > ${SAMPLEBASE}-recalibrate.out 2> ${SAMPLEBASE}-recalibrate.err"
echo $CMD
eval $CMD

# STEP 8:
# Apply Base Quality Score Recalibration (BQSR).
# Create the recalibrated bam file. This file will be used for all subsequent analyses.
# This step can be time and memory consuming; the '-Xmx8g' option matches our
# requested resources: 2 cores, 4g per core, 8g in total.
CMD="time java -Xmx8g -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR ApplyBQSR \
    -R $REF \
    -I ${SAMPLEBASE}-sort-rg-dedup-realigned.bam \
    -bqsr-recal-file ${SAMPLEBASE}-recal-table.txt \
    -O ${SAMPLEBASE}-GATK.bam \
    > ${SAMPLEBASE}-recalbam.out 2> ${SAMPLEBASE}-recalbam.err"
echo $CMD
eval $CMD

# STEP 9:
# Create a bam file index for the GATK bam file.
CMD="time java -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $PICARD_JAR BuildBamIndex \
    -I ${SAMPLEBASE}-GATK.bam \
    > ${SAMPLEBASE}-GATKbamindex.out 2> ${SAMPLEBASE}-GATKbamindex.err"
echo $CMD
eval $CMD

# STEP 10:
# Call variants using HaplotypeCaller:
# https://software.broadinstitute.org/gatk/documentation/tooldocs/current/org_broadinstitute_gatk_tools_walkers_haplotypecaller_HaplotypeCaller.php
# This step is very time/memory intensive and can be optimized with more memory and multithreading.
# For memory, use the java '-Xmx' option to set the total memory allocated to the job.
# For multithreading, the Broad Institute suggests using the GATK Spark tools instead of
# the local version, which is beyond the scope of this example.
CMD="time java -Xmx8g -Djava.io.tmpdir=$GATK_TMPDIR \
    -jar $GATK_LOCAL_JAR HaplotypeCaller \
    -R $REF \
    -I ${SAMPLEBASE}-GATK.bam \
    --dbsnp $SNP \
    -O ${SAMPLEBASE}-GATK-HC.vcf \
    > ${SAMPLEBASE}-GATK-HC.out 2> ${SAMPLEBASE}-GATK-HC.err"
echo $CMD
eval $CMD

# Print the end message:
echo "DONE!"
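
# Optional post-run summary (an add-on, not part of the original workflow):
# count the non-header records in the final VCF so the qlog shows at a glance
# whether HaplotypeCaller emitted any variants for this small 200-spot example.
if [ -s ${SAMPLEBASE}-GATK-HC.vcf ]; then
    echo "Variant records in ${SAMPLEBASE}-GATK-HC.vcf: $(grep -vc '^#' ${SAMPLEBASE}-GATK-HC.vcf)"
fi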