#!/bin/bash -l

#Specify project
#$ -P scv #replace with your own project_name

#Give the job a name
#$ -N tophat_example

#Send an email when the job is finished (or aborted)
#$ -m ae

#Join the error and output file
#$ -j y

#Specify qlog directory: 
#$ -o ../qlog # make sure the directory pre-exist before job starts to run

#Request multiple cores:
#since tophat supports multithreading (use --threads option),
#we request 4 cores in this example 
#$ -pe omp 4

# Record the key facts about this job at the top of the log: start time,
# execution host, working directory, and the job ID / job name / slot count
# read from the JOB_ID, JOB_NAME and NSLOTS variables.
cat <<EOF
==========================================================
Starting on : $(date)
Running on node : $(hostname)
Current directory : $(pwd)
Current job ID : $JOB_ID
Current job name : $JOB_NAME
Number of cores: $NSLOTS
==========================================================
EOF


# Locations used by the pipeline; all are relative to the directory the job
# runs in.
DATA_DIR="../data"   # input reads
REF_DIR="../ref"     # reference FASTA and the bowtie2 index built from it
OUT_DIR="../out"     # alignment output

# Specify the versions of bowtie2 and tophat to use; the modules are loaded
# one at a time, in this order.
for mod in bowtie2/2.4.2 tophat/2.1.1; do
    module load "$mod"
done

# In this example we build everything from scratch: first remove any existing
# index, then rebuild it with the same bowtie2 that is used in the pipeline.
# The index is built only once per job; the build command itself follows
# below.
#
# bowtie2-build writes its index as several files named <prefix>.N.bt2 and
# <prefix>.rev.N.bt2 (*.bt2l for large indexes). A file called exactly
# <prefix>.bt2 is never produced, so testing for that name would never detect
# a stale index. 'rm -f' is silent when nothing matches, so the cleanup simply
# runs unconditionally.
rm -f -- "${REF_DIR}"/test_ref.*.bt2 "${REF_DIR}"/test_ref.*.bt2l

# Build the index. This is not a great showcase for multithreading: the
# reference genome is quite small and the build finishes quickly even on a
# single core, so '--threads' is used purely for demonstration. In a real
# scenario, with a much larger reference genome and read data, adding the
# multithreading option is a good idea.
#
# The command is held in an array rather than a string passed to 'eval', so
# every argument reaches bowtie2-build exactly as written (even if a path
# contains spaces). NSLOTS is expected to be set by the scheduler; fall back
# to a single thread when it is unset or empty so '--threads' always gets a
# value.
index_cmd=(
    bowtie2-build
    --threads "${NSLOTS:-1}"
    "${REF_DIR}/test_ref.fa"
    "${REF_DIR}/test_ref"
)
# keep the command text in CMD and print it for tracking/debugging
CMD="${index_cmd[*]}"
echo "$CMD"
# execute the command; stop the job if the index could not be built, because
# the alignment step cannot succeed without it
if ! "${index_cmd[@]}"; then
    echo "ERROR: bowtie2-build failed; aborting job" >&2
    exit 1
fi
echo "Done with building index!"

# Now that the index has been built, call tophat directly.
# The output is written to the ${OUT_DIR}/tophat_pe_qsub subdirectory. The
# directory name is fixed; to keep the results of different jobs apart,
# ${JOB_NAME} or ${JOB_ID} could be made part of it.
# Note: the multithreading option differs from one tool to another, so use
# '-h' or '--help' to check and verify the correct spelling.

# Make sure the parent output directory exists before tophat runs
# (NOTE(review): tophat appears to create only the final directory level —
# confirm against the installed version).
if ! mkdir -p -- "${OUT_DIR}"; then
    echo "ERROR: cannot create output directory '${OUT_DIR}'" >&2
    exit 1
fi

# As with the index build, the command is an array instead of an eval'ed
# string, and NSLOTS falls back to 1 when it is unset or empty.
tophat_cmd=(
    tophat
    --num-threads "${NSLOTS:-1}"
    --no-coverage-search
    -o "${OUT_DIR}/tophat_pe_qsub"
    "${REF_DIR}/test_ref"
    "${DATA_DIR}/reads_1.fq"
    "${DATA_DIR}/reads_2.fq"
)
# keep the command text in CMD and print it for tracking/debugging
CMD="${tophat_cmd[*]}"
echo "$CMD"
# Run the alignment; on failure exit with tophat's own status so the job is
# not reported as successful.
"${tophat_cmd[@]}" || {
    status=$?
    echo "ERROR: tophat failed with exit status ${status}" >&2
    exit "${status}"
}

# From here you can add further post-processing steps to look into the alignment results in more detail.

# Final marker in the log: the script reached its last line.
printf '%s\n' "DONE!"