両方とも前のリビジョン 前のリビジョン | |
blast_db検証 [2022/06/18 13:31] – [Fishes全ゲノムデータベース] 118.240.79.152 | blast_db検証 [Unknown date] (現在) – 削除 - 外部編集 (Unknown date) 127.0.0.1 |
---|
accession->taxonomy pathを付与 | |
``` | |
awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); a[arr[1]]=$0} FILENAME==ARGV[2] && $1 in a{ac2tax[$1]=$3} FILENAME==ARGV[3]{tax2name[$1]=$2} END{for(i in a){print a[i]"\t"tax2name[ac2tax[i]]}}' <(seqkit fx2tab SSU_eukaryote_rRNA.fasta) <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2021-06-01_taxdump/names.dmp.sname.path > SSU_eukaryote_rRNA.fasta.path | |
``` | |
| |
mafftで魚類だけアライメント | |
``` | |
軟骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes | |
硬骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii | |
| |
more SSU_eukaryote_rRNA.fasta.path|egrep "(root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes|root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii)"|awk -F'\t' '{OFS="\t"; gsub(" ","_",$1); print ">"$1; print $2}' > SSU_eukaryote_rRNA.fasta.path.fish.fa | |
mafft.bat --thread 32 SSU_eukaryote_rRNA.fasta.path.fish.fa > SSU_eukaryote_rRNA.fasta.path.fish.fa.mafft | |
``` | |
| |
SSU ITS1 5.8S ITS2 LSUが完璧なもの(ヒトとか)のみ抽出 | |
``` | |
awk -F'\t' 'FILENAME==ARGV[1]{a[substr($0,2)]=1} FILENAME==ARGV[2]&&$1 in a{gsub(" ","_",$1); print ">"$1; print $2}' <(cat /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta |grep "SSU ITS1 5.8S ITS2 LSU"|grep -v ">$"|grep -v "<") <(seqkit fx2tab /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta) > ITS_eukaryote_sequences.fasta.complete.fa | |
``` | |
| |
## SSU-LSUをblast結果から抜き出す | |
| |
`/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/run.sh` | |
``` | |
#!/bin/bash | |
| |
N_CPU=8 | |
maxgap=3000 | |
mincov=0.8 | |
minscore=3000 | |
| |
ref="$1" | |
| |
makeblastdb -in $ref -dbtype nucl | |
blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn | |
cat $ref.its.blastn|sort -k14,14nr| | |
awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| | |
awk -F'\t' -v maxgap=$maxgap 'BEGIN{ | |
OFS="\t"; | |
} | |
{ | |
if($11<$12){o="+"}else{o="-"}; | |
if(oldqname==$1&&oldsname==$2&&oldo==o){ | |
if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; | |
str=deltaq"\t"deltas | |
} | |
if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ | |
if(oldsname!=""){ | |
if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} | |
else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; | |
} | |
str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; | |
}; | |
if(jointq1==0&&jointq2==0){ | |
if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる | |
}; | |
if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; | |
if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ | |
if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} | |
else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; | |
print "joint::"str2 | |
}; | |
oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 | |
jointscore+=$14 | |
$13=o"\t"str"\t"$13; | |
print $0; | |
} END{ | |
if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} | |
else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; | |
}' > $ref.its.blastn.tab | |
cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result | |
| |
seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{print revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta | |
| |
``` | |
| |
## ゲノムFASTAファイル一括ダウンロードしてssu~lsuを抽出 | |
| |
https://www.ncbi.nlm.nih.gov/genome/browse#!/eukaryotes/fishes | |
| |
NCBI Genome DBでfishesと検索して出てきた結果を「Download」ボタンを押してダウンロード。「eukaryotes.csv」がダウンロードされる。 | |
| |
``` | |
more eukaryotes.csv|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|awk -F, '{if($15!=""){print $15}else if($16!=""){print $16}else{print NR,"error"}}'|tail -n+2|grep -v error$|sed 's/"//g; s/$/\//'|awk '{print "wget -t 0 -c -rl1 --accept=.fna.gz "$0}'|while read i; do $i; done | |
#途中で止まっているファイルが2%くらいあるので、もう一度実行して途中で止まったダウンロードをレジュームしておく。 | |
#それでも変になっているファイルがあるので、zcatするとinvalid compressed data--format violatedと出てくるエラーメッセージを集めて下記のコマンドで再度ダウンロード | |
more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|awk '{a=$0; sub("/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/","",a); print "rm -f "$0"; wget http://"a}'|xargs -I{} bash -c "{}" | |
more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|sort|uniq|awk -F/ '{print "mv "$NF" "$0}'|xargs -I{} bash -c "{}" | |
| |
``` | |
| |
ssu~lsuを抽出 | |
``` | |
find ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i; do qsub -j y ~/qsubsh8 ./run-for-gz.sh $i; done | |
``` | |
| |
``` | |
$ more run-for-gz.sh | |
#!/bin/bash | |
| |
N_CPU=8 | |
maxgap=3000 | |
mincov=0.8 | |
minscore=3000 | |
| |
resdir="$PWD/ssu-lsu" | |
mkdir -p "$resdir" | |
base=`basename $1 .gz` | |
mkdir /tmp/$base | |
zcat "$1" > /tmp/$base/$base | |
#ref="$1" | |
ref=/tmp/$base/$base | |
| |
makeblastdb -in $ref -dbtype nucl | |
blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28 | |
S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn | |
cat $ref.its.blastn|sort -k14,14nr| | |
awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| | |
awk -F'\t' -v maxgap=$maxgap 'BEGIN{ | |
OFS="\t"; | |
} | |
{ | |
if($11<$12){o="+"}else{o="-"}; | |
if(oldqname==$1&&oldsname==$2&&oldo==o){ | |
if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; | |
str=deltaq"\t"deltas | |
} | |
if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ | |
if(oldsname!=""){ | |
if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} | |
else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; | |
} | |
str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; | |
}; | |
if(jointq1==0&&jointq2==0){ | |
if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる | |
}; | |
if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; | |
if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ | |
if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} | |
else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; | |
print "joint::"str2 | |
}; | |
oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 | |
jointscore+=$14 | |
$13=o"\t"str"\t"$13; | |
print $0; | |
} END{ | |
if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} | |
else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; | |
}' > $ref.its.blastn.tab | |
cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result | |
| |
seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b | |
=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{prin | |
t revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta | |
| |
mv $ref.its.blastn.tab.result.fasta $ref.its.blastn "$resdir"/ | |
rm -rf /tmp/$base | |
``` | |
| |
taxonomy pathを取得 | |
``` | |
more eukaryotes.csv |tail -n+2|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|cut -f 1,6 -d,|sed 's/^"//; s/"$//; s/","/\t/'|sed "s/'//g"|awk -F'\t' 'FILENAME==ARGV[1]{a[$3]=$1} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3]{print $2"\t"$1"\t"a[$1]"\t"path[a[$1]]}' <(cat /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname|sed "s/'//g") /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > eukaryotes.csv.path | |
``` | |
| |
新規に追加できた種数を確認(1種につき、ゲノムは複数存在するため) | |
``` | |
for i in ssu-lsu/*.fasta; do if [ -s "$i" ]; then basename $i|sed 's/[.].*//'; fi; done|awk -F'\t' 'FILENAME==ARGV[1]{OFS="\t"; split($1,arr,"."); a[arr[1]]=$4} FILENAME==ARGV[2]{print $1"\t"a[$1]}' eukaryotes.csv.path /dev/stdin |cut -f 2|sort|uniq|wc -l | |
``` | |
| |
## Fishes全ゲノムデータベース | |
| |
``` | |
find /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i ; do j=`basename $i _genomic.fna.gz|cut -f 1 -d.`; zcat $i|sed 's/^>/>'"$j"'__/'; done > fishes.all.genome.fa | |
/suikou/tool/ncbi-blast-2.13.0+/bin/makeblastdb -in fishes.all.genome.fa -dbtype nucl -parse_seqids -hash_index | |
| |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db "fishes.all.genome.fa /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt" -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn | |
| |
#下記のエラー | |
Error: NCBI C++ Exception: | |
T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/corelib/ncbiobj.cpp", line 992: Critical: (CCoreException::eNullPtr) ncbi::CObject::ThrowNullPointerException() - Attempt to access NULL pointer. | |
T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/app/blast/blast_app_util.cpp", line 773: Critical: (CCoreException::eNullPtr) BLAST::ncbi::BlastFormatter_PreFetchSequenceData() - Error pre-fetching sequence data | |
Stack trace: | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CObject::ThrowNullPointerException() offset=0x9C addr=0x16d38fc | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x122c5fc] offset=0x0 addr=0x122c5fc | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::IsSetInst_Length() const offset=0x9 addr=0x122c919 | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::GetBioseqLength() const offset=0xE addr=0x122ccce | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::blast::LoadSequencesToScope(std::vector<ncbi::objects::CSeq_id_Handle, std::allocator<ncbi::objects::CSeq_id_Handle> >&, std::vector<ncbi::CRange<unsigned int>, std::allocator<ncbi::CRange<unsigned int> > >&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>&) offset=0x639 addr=0xd26399 | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::BlastFormatter_PreFetchSequenceData(ncbi::blast::CSearchResultSet const&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>, ncbi::blast::CFormattingArgs::EOutputFormat) offset=0x89F addr=0xa065bf | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::x_RunMTBySplitDB() offset=0x1B41 addr=0x9fbd91 | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::Run() offset=0x34 addr=0x9fe7a4 | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::x_TryMain(ncbi::EAppDiagStream, char const*, int*, bool*) offset=0x13B addr=0x16274fb | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::AppMain(int, char const* const*, char const* const*, ncbi::EAppDiagStream, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) offset=0x6FD addr=0x162af2d | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 main offset=0x69 addr=0x9c4079 | |
/lib64/libc.so.6 ???:0 __libc_start_main offset=0xF5 addr=0x7f4c31307b35 | |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x9f0db3] offset=0x0 addr=0x9f0db3 | |
| |
#一つずつblastをかけることに | |
for i in `seq -w 0 99` `seq 100 108`; do echo $i; time /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db fishes.all.genome.fa.$i -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn.fish.$i; done | |
#taxonomy path追加 | |
cat DRR086650.fasta.blastn.fish.*|awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); path[arr[1]]=$4} FILENAME==ARGV[2]{split($2,arr,"__"); print $0"\t"path[arr[1]]}' eukaryotes.csv.path /dev/stdin > DRR086650.fasta.blastn-fish-all | |
| |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.nt | |
cat DRR086650.fasta.blastn.nt|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[4],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[4],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.nt <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.nt.path | |
| |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.ssu | |
cat DRR086650.fasta.blastn.ssu|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[2],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[2],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.ssu <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.ssu.path | |
| |
/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/pr2/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 16 > DRR086650.fasta.blastn.pr2 | |
cat DRR086650.fasta.blastn.pr2|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"."); a[arr[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"."); print $0"\t"taxid2path[acc2taxid[arr[1]]]}' DRR086650.fasta.blastn.pr2 <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.pr2.path | |
| |
cat DRR086650.fasta.blastn-fish-all DRR086650.fasta.blastn.nt.path DRR086650.fasta.blastn.ssu.path DRR086650.fasta.blastn.pr2.path|sort -k1,1V -k14,14nr --parallel 40 -S 100G > DRR086650.fasta.all.blastn | |
| |
#LCA | |
awk -F'\t' -v percent=0.99 ' | |
#検索用の関数 関数内でローカル変数を使用するために、関数内で使用する全変数を引数の後ろ側につけておく。これはawkのいけてない仕様だそう。 | |
function searchLCA(data, i, j, res, res2, str, n, stopflag){ | |
#同じリードのデータを全て検索して共通していないパスを空白に置換しておく。 | |
for(i in data){ | |
if(n==0){n=split(i,res,";")} | |
else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}} | |
} | |
if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1}; | |
for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}} | |
return str; | |
} | |
{ | |
#新しいリードに変わったら、いろんな変数を初期化しておく | |
if($1!=old){if($17!=""){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; basescore=$14; data[$17]=1; old=$1; oldstr=$0}} | |
#前の行と同じリードで、閾値以上(bitscoreがトップヒットの99%以上)であればデータ配列に突っ込む | |
else if($14>=basescore*percent && $17!=""){data[$17]=1} | |
} | |
END{print searchLCA(data)"\t"oldstr}' DRR086650.fasta.all.blastn > DRR086650.fasta.all.blastn.lca | |
| |
more DRR086650.fasta.all.blastn.lca|awk -F'\t' '$7>$4*0.8 && $6>90'|cut -f 1|sort|uniq -c |sort -nr|grep "root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata"|more | |
| |
``` | |
| |
## PR2データベース検証 | |
| |
``` | |
wget https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta.gz | |
| |
more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|sed 's/^>//'|awk -F. '{print $1"\t"$0}'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=$2} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3] && $1 in a{print a[$1]"\t"path[$3]}' /dev/stdin /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) > pr2_version_4.14.0_SSU_taxo_long.fasta.path | |
``` | |
| |
``` | |
$ more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|cut -f 2,3 -d "|"|sort|uniq -c |sort -nr | |
183237 18S_rRNA|nucleus | |
7968 16S_rRNA| | |
6350 16S_rRNA|plastid | |
22 16S_rRNA|apicoplast | |
16 16S_rRNA|mitochondrion | |
7 16S_rRNA|chromatophore | |
2 18S_rRNA|nucleomorph | |
``` | |
| |
| |