Instructor Notes
This is a placeholder file. Please add content here.
NCBI
BLAST
Instructor Note
Instructor Note
The IPG record can be found here: https://www.ncbi.nlm.nih.gov/ipg/1258700
The fasta file can be found here: https://www.ncbi.nlm.nih.gov/protein/WP_000263098.1?report=fasta
Instructor Note
BASH
scp rpoB_ecoli.fasta <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/blast
Instructor Note
Instructor Note
BASH
blastdbcmd -db landmark -info
blastdbcmd -db nr -info
time blastdbcmd -db landmark -entry WP_000263098 > /dev/null
time blastdbcmd -db nr -entry WP_000263098 > /dev/null
Instructor Note
Instructor Note
Instructor Note
BASH
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/blast/rpoB_landmark.tab .
Instructor Note
BASH
blastp -db refseq_select_prot -query rpoB_ecoli.fasta -evalue 1e-6 -outfmt 6 > rpoB_refseq_select.tab &
Instructor Note
BASH
cut -f2 rpoB_landmark.tab | sort | uniq > rpoB_landmark_ids
blastdbcmd -db landmark -entry_batch rpoB_landmark_ids > rpoB_homologs.fasta
Instructor Note
Instructor Note
Instructor Note
Instructor Note
BASH
psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt > ravC_Leg.psiblast
Instructor Note
BASH
psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt -inclusion_ethresh 1e-10 -evalue 1e-6 > ravC_Leg.psiblast
Instructor Note
BASH
psiblast -query ravC_LP.fasta -db refseq_select_prot -save_pssm_after_last_round -out_pssm ravC_Leg.pssm -out_ascii_pssm ravC_Leg.pssm.txt -inclusion_ethresh 1e-10 -evalue 1e-6 -taxids 118969 > ravC_Leg.psiblast
Instructor Note
psiblast -in_pssm ravC_Leg.pssm -db refseq_select_prot -inclusion_ethresh 1e-10 -evalue 1e-6 -max_target_seqs 1000 -num_iterations 10 > ravC_all.psiblast
Instructor Note
psiblast -in_pssm ravC_Leg.pssm -db refseq_select_prot -inclusion_ethresh 1e-10 -evalue 1e-6 -max_target_seqs 1000 -num_iterations 10 -outfmt “7 qaccver saccver pident length mismatch gapopen qstart qend sstart send qcovs evalue bitscore ssciname scomname sskingdom” > ravC_all.psiblast
Multiple sequence alignment
Instructor Note
BASH
interactive -A uppmax2025-3-4 -M snowy -t 4:00:00
cd /proj/g2020004/nobackup/3MK013/<username>
mkdir phylogenetics
cd phylogenetics
Instructor Note
Instructor Note
Instructor Note
Instructor Note
BASH
scp lionel@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<user>/phylogenetics/rpoB.\*.aln .
Instructor Note
BASH
trimal -in rpoB.einsi.aln -out rpoB.einsi.trimalgt.aln -gt 0.6 -htmlout rpoB.einsi.trimalgt.aln.html
Instructor Note
BASH
trimal -in rpoB.einsi.aln -out rpoB.einsi.trimalauto.aln -automated1 -htmlout rpoB.einsi.trimalauto.aln.html
Instructor Note
BASH
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.trimal\* .
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.trimal* .
Instructor Note
BASH
module load ClipKIT
clipkit -h
clipkit rpoB.einsi.aln -m smart-gap -l -o rpoB.einsi.clipkit.aln
Instructor Note
BASH
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<username>/phylogenetics/rpoB.einsi.clipkit.aln .
Then use seaview
or another viewer to visualize and
compare results.
Phylogenetics
Instructor Note
BASH
interactive -A uppmax2025-3-4 -M snowy -t 4:00:00
module load bioinfo-tools iqtree
cd /proj/g2020004/nobackup/3MK013/<username>/phylogenetics
Instructor Note
Reads, QC and trimming
Instructor Note
BASH
interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>
pwd
Instructor Note
BASH
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/195/955/GCF_000195955.2_ASM19595v2/GCF_000195955.2_ASM19595v2_genomic.fna.gz
Instructor Note
Instructor Note
BASH
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/data
for sample in ERR029207 ERR029206 ERR026478 ERR026474 ERR026473 ERR026481 ERR026482
do
for end in 1 2
do
ln -s ../../../data/fastq/"${sample}"_"${end}".fastq.gz .
done
done
Instructor Note
BASH
for infile in *.fastq.gz
do
outfile="${infile}"_trim.fastq
seqtk trimfq -q 0.01 "${infile}" > "${outfile}"
done
Sequence assembly
Instructor Note
BASH
interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/
mkdir results/assembly
cd data/trimmed_fastq
Instructor Note
Instructor Note
BASH
for accession in ERR029207 ERR029206 ERR026478 ERR026474 ERR026473 ERR026481 ERR026482
do
echo "${accession}"
gt seqstat "${accession}".fasta | grep -e N50 -e L50
done
Instructor Note
BASH
module load spades
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
mkdir spades_assembly
cd spades_assembly
for accession in ERR029206
do
echo "${accession}"
spades.py \
--isolate \
-1 ../../data/trimmed_fastq/"${accession}"_1.fastq.gz_trim.fastq \
-2 ../../data/trimmed_fastq/"${accession}"_2.fastq.gz_trim.fastq \
-o "${accession}"_spades \
-t 2 \
-m 8
done
Instructor Note
BASH
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
gt seqstat assembly/ERR029206.fasta
gt seqstat spades_assembly/ERR029206_spades/contigs.fasta
Instructor Note
BASH
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results
mkdir annotation
cd annotation
module load prodigal
prodigal -i ../assembly/ERR029206.fasta -a ERR029206.faa > ERR029206.gbk
Instructor Note
Instructor Note
Mapping
Getting missing data
If you don’t have all the trimmed reads, extract them from the
archive in the data
folder, using tar
.
tar
is a program to take multiple files, including their
folder structure and put them in a single tar
“ball”. Often
the tarballs are compressed with gzip
.
tar
and gzip
are computationally intensive
and require you to start an interactive session.
BASH
interactive -A uppmax2025-3-4 -M snowy -t 04:00:00
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/data
mv trimmed_fastq trimmed_fastq_partial
tar xvzf ../../../data/trimmed_fastq.tar.gz
ls trimmed_fastq
Instructor Note
BASH
for accession in ERR*; do
echo "${accession}"
cat "${accession}"/snps.txt | grep "Variant-SNP"
done
Instructor Note
BASH
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results/snps
module load bioinfo-tools iqtree
iqtree2 -s core.aln -m MFP+ASC
Instructor Note
BASH
scp <username>@rackham.uppmax.uu.se:/proj/g2020004/nobackup/3MK013/<user>/molepi/results/tree/core.aln.newick .
Instructor Note
BASH
cd /proj/g2020004/nobackup/3MK013/<username>/molepi/results/snps
iqtree2 -s core.aln -m MFP+ASC -b 100 --prefix core.aln.100npb
cd ..
mkdir tree100npb
mv snps/core.aln.100npb* tree100npb/