Instructor Notes

This is a placeholder file. Please add content here.

NCBI


BLAST


Instructor Note

BASH

ssh –Y username@rackham.uppmax.uu.se
cd /proj/g2020004/nobackup/3MK013/<username>/
mkdir blast


Instructor Note

The IPG record can be found here: https://www.ncbi.nlm.nih.gov/ipg/1258700

The fasta file can be found here: https://www.ncbi.nlm.nih.gov/protein/WP_000263098.1?report=fasta



Instructor Note

BASH

scp rpoB_ecoli.fasta <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/blast


Instructor Note

BASH

blastdbcmd -db landmark -entry WP_000263098 > rpoB_ecoli.fasta


Instructor Note

BASH

blastdbcmd -db landmark -info
blastdbcmd -db nr -info

time blastdbcmd -db landmark -entry WP_000263098 > /dev/null
time blastdbcmd -db nr -entry WP_000263098 > /dev/null


Instructor Note

BASH

blastp -db landmark -query rpoB_ecoli.fasta > rpoB_landmark.blast


Instructor Note

BASH

blastp -db landmark -query rpoB_ecoli.fasta -evalue 1e-6 > rpoB_landmark.blast


Instructor Note

BASH

blastp -db landmark -query rpoB_ecoli.fasta -evalue 1e-6 -outfmt 6 > rpoB_landmark.tab

BASH

scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/blast/rpoB_landmark.tab .


Instructor Note

BASH

blastp -db refseq_select_prot -query rpoB_ecoli.fasta -evalue 1e-6 -outfmt 6 > rpoB_refseq_select.tab &

BASH

wc -l rpoB_refseq_select.tab


Instructor Note

BASH

cut -f2 rpoB_landmark.tab | sort | uniq > rpoB_landmark_ids 
blastdbcmd -db landmark -entry_batch rpoB_landmark_ids > rpoB_homologs.fasta


Instructor Note

BASH

grep ">" rpoB_homologs.fasta | less
grep ">" rpoB_homologs.fasta | wc -l 


Instructor Note

BASH

blastdbcmd -db refseq_protein -entry WP_010945868.1 > ravC_LP.fasta


Instructor Note

BASH

psiblast -query ravC_LP.fasta -db refseq_select_prot > ravC_Leg.psiblast


Instructor Note

BASH

psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt > ravC_Leg.psiblast


Instructor Note

BASH

psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt -inclusion_ethresh 1e-10 -evalue 1e-6 > ravC_Leg.psiblast


Instructor Note

BASH

psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt -inclusion_ethresh 1e-10 -evalue 1e-6 -taxids 118969 > ravC_Leg.psiblast


Instructor Note

psiblast -in_pssm ravC_Leg.pssm -db refseq_select_prot -inclusion_ethresh 1e-10 -evalue 1e-6 -max_target_seqs 1000 -num_iterations 10 > ravC_all.psiblast



Instructor Note

psiblast -in_pssm ravC_Leg.pssm -db refseq_select_prot -inclusion_ethresh 1e-10 -evalue 1e-6 -max_target_seqs 1000 -num_iterations 10 -outfmt “7 qaccver saccver pident length mismatch gapopen qstart qend sstart send qcovs evalue bitscore ssciname scomname sskingdom” > ravC_all.psiblast



Multiple sequence alignment


Instructor Note

BASH

ssh -Y <username>@rackham.uppmax.uu.se

BASH

interactive -A uppmax2025-3-4 -M snowy -t 4:00:00
cd /proj/g2020004/nobackup/3MK013/<username>
mkdir phylogenetics
cd phylogenetics


Instructor Note

BASH

cp ../../data/rpoB/rpoB.fasta .


Instructor Note

BASH

module load bioinfo-tools MAFFT
time mafft-fftns rpoB_renamed.fasta > rpoB.fftns.aln


Instructor Note

BASH

time mafft-einsi rpoB_renamed.fasta > rpoB.einsi.aln


Instructor Note

BASH

scp lionel@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<user>/phylogenetics/rpoB.\*.aln .


Instructor Note

BASH

trimal -in rpoB.einsi.aln -out rpoB.einsi.trimalgt.aln -gt 0.6 -htmlout rpoB.einsi.trimalgt.aln.html


Instructor Note

BASH

trimal -in rpoB.einsi.aln -out rpoB.einsi.trimalauto.aln -automated1 -htmlout rpoB.einsi.trimalauto.aln.html


Instructor Note

BASH

scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.trimal\* .
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.trimal* .


Instructor Note

BASH

module load ClipKIT
clipkit -h
clipkit rpoB.einsi.aln -m smart-gap -l -o rpoB.einsi.clipkit.aln


Instructor Note

BASH

scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.clipkit.aln .

Then use seaview or another viewer to visualize and compare results.



Phylogenetics


Instructor Note

BASH

interactive -A uppmax2025-3-4 -M snowy -t 4:00:00
module load bioinfo-tools iqtree
cd /proj/g2020004/nobackup/3MK013/<username>/phylogenetics


Instructor Note

BASH

iqtree2 -s rpoB.fftns.aln -m MFP -B 1000

BASH

figtree rpoB.fftns.aln.treefile &


Reads, QC and trimming


Instructor Note

BASH

interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>
pwd


Instructor Note

BASH

wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/GCF_000195955.2_ASM19595v2_genomic.fna.gz


Instructor Note

BASH

seqkit stats GCF_000195955.2_ASM19595v2_genomic.fna


Instructor Note

BASH

cd /proj/g2020004/nobackup/3MK013/<username>/molepi/data
for sample in ERR029207 ERR029206 ERR026478 ERR026474 ERR026473 ERR026481 ERR026482
do
  for end in 1 2
  do
    ln -s ../../../data/fastq/"${sample}"_"${end}".fastq.gz .
  done
done


Instructor Note

BASH

for infile in *.fastq.gz
do
  outfile="${infile}"_trim.fastq
  seqtk trimfq -q 0.01 "${infile}" >  "${outfile}" 
done


Sequence assembly


Instructor Note

BASH

interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/
mkdir results/assembly
cd data/trimmed_fastq


Instructor Note

BASH

module load GenomeTools 
gt seqstat ERR026473.fasta


Instructor Note

BASH

for accession in ERR029207 ERR029206 ERR026478 ERR026474 ERR026473 ERR026481 ERR026482
do
  echo "${accession}"
  gt seqstat "${accession}".fasta | grep -e N50 -e L50
done


Instructor Note

BASH

module load spades
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
mkdir spades_assembly
cd spades_assembly
for accession in ERR029206
do
  echo "${accession}"
  spades.py \
    --isolate \
    -1 ../../data/trimmed_fastq/"${accession}"_1.fastq.gz_trim.fastq \
    -2 ../../data/trimmed_fastq/"${accession}"_2.fastq.gz_trim.fastq \
    -o "${accession}"_spades \
    -t 2 \
    -m 8
done


Instructor Note

BASH

cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
gt seqstat assembly/ERR029206.fasta
gt seqstat spades_assembly/ERR029206_spades/contigs.fasta


Instructor Note

BASH

cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
mkdir annotation
cd annotation
module load prodigal
prodigal -i ../assembly/ERR029206.fasta -a ERR029206.faa > ERR029206.gbk


Instructor Note

BASH

module load blast
makeblastdb -dbtype prot -in ERR029206.faa -out ERR029206


Instructor Note

BASH

blastp -query ../../../blast/rpoB_ecoli.fasta -db ERR029206


Mapping


Getting missing data

If you don’t have all the trimmed reads, extract them from the archive in the data folder, using tar. tar is a program to take multiple files, including their folder structure and put them in a single tar “ball”. Often the tarballs are compressed with gzip.

tar and gzip are computationally intensive and require you to start an interactive session.

BASH

interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/data
mv trimmed_fastq trimmed_fastq_partial
tar xvzf ../../../data/trimmed_fastq.tar.gz
ls trimmed_fastq


Instructor Note

BASH

for accession in ERR*; do
  echo "${accession}"
  cat "${accession}"/snps.txt | grep "Variant-SNP"
done


Instructor Note

BASH

cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results/snps
module load bioinfo-tools iqtree
iqtree2 -s core.aln -m MFP+ASC


Instructor Note

BASH

scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<user>/molepi/results/tree/core.aln.newick .


Instructor Note

BASH

cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results/snps
iqtree2 -s core.aln -m MFP+ASC -b 100 --prefix core.aln.100npb
cd ..
mkdir tree100npb
mv snps/core.aln.100npb* tree100npb/


Molecular epidemiology