blast_db検証

accession→taxonomy pathを付与

awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); a[arr[1]]=$0} FILENAME==ARGV[2] && $1 in a{ac2tax[$1]=$3} FILENAME==ARGV[3]{tax2name[$1]=$2} END{for(i in a){print a[i]"\t"tax2name[ac2tax[i]]}}' <(seqkit fx2tab SSU_eukaryote_rRNA.fasta) <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2021-06-01_taxdump/names.dmp.sname.path > SSU_eukaryote_rRNA.fasta.path

mafftで魚類だけアライメント

軟骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes
硬骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii

more SSU_eukaryote_rRNA.fasta.path|egrep "(root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes|root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii)"|awk -F'\t' '{OFS="\t"; gsub(" ","_",$1); print ">"$1; print $2}' > SSU_eukaryote_rRNA.fasta.path.fish.fa
mafft.bat --thread 32 SSU_eukaryote_rRNA.fasta.path.fish.fa > SSU_eukaryote_rRNA.fasta.path.fish.fa.mafft

SSU ITS1 5.8S ITS2 LSUが完璧なもの(ヒトとか)のみ抽出

awk -F'\t' 'FILENAME==ARGV[1]{a[substr($0,2)]=1} FILENAME==ARGV[2]&&$1 in a{gsub(" ","_",$1); print ">"$1; print $2}' <(cat /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta |grep "SSU ITS1 5.8S ITS2 LSU"|grep -v ">$"|grep -v "<") <(seqkit fx2tab /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta) > ITS_eukaryote_sequences.fasta.complete.fa

/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/run.sh

#!/bin/bash

N_CPU=8
maxgap=3000
mincov=0.8
minscore=3000

ref="$1"

makeblastdb -in $ref -dbtype nucl
blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn
cat $ref.its.blastn|sort -k14,14nr|
 awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n|
 awk -F'\t' -v maxgap=$maxgap 'BEGIN{
  OFS="\t";
 }
 {
  if($11<$12){o="+"}else{o="-"};
  if(oldqname==$1&&oldsname==$2&&oldo==o){
   if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1};
   str=deltaq"\t"deltas
  }
  if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){
   if(oldsname!=""){
    if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore}
    else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore};
   }
   str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0;
  };
  if(jointq1==0&&jointq2==0){
   if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる
  };
  if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11};
  if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){
   if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1}
   else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1};
   print "joint::"str2
  };
  oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4
  jointscore+=$14
  $13=o"\t"str"\t"$13;
  print $0;
 } END{
    if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore}
    else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore};
 }' > $ref.its.blastn.tab
cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result

seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x,  i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{print revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta

https://www.ncbi.nlm.nih.gov/genome/browse#!/eukaryotes/fishes

NCBI Genome DBでfishesと検索して出てきた結果を「Download」ボタンを押してダウンロード。「eukaryotes.csv」がダウンロードされる。

more eukaryotes.csv|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|awk -F, '{if($15!=""){print $15}else if($16!=""){print $16}else{print NR,"error"}}'|tail -n+2|grep -v error$|sed 's/"//g; s/$/\//'|awk '{print "wget -t 0 -c -rl1 --accept=.fna.gz "$0}'|while read i; do $i; done
#途中で止まっているファイルが2%くらいあるので、もう一度実行して途中で止まったダウンロードをレジュームしておく。
#それでも変になっているファイルがあるので、zcatするとinvalid compressed data--format violatedと出てくるエラーメッセージを集めて下記のコマンドで再度ダウンロード
more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|awk '{a=$0; sub("/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/","",a); print "rm -f "$0"; wget http://"a}'|xargs -I{} bash -c "{}"
more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|sort|uniq|awk -F/ '{print "mv "$NF" "$0}'|xargs -I{} bash -c "{}"

ssu~lsuを抽出

find ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i; do qsub -j y ~/qsubsh8 ./run-for-gz.sh $i; done
$ more run-for-gz.sh
#!/bin/bash

N_CPU=8
maxgap=3000
mincov=0.8
minscore=3000

resdir="$PWD/ssu-lsu"
mkdir -p "$resdir"
base=`basename $1 .gz`
mkdir /tmp/$base
zcat "$1" > /tmp/$base/$base
#ref="$1"
ref=/tmp/$base/$base

makeblastdb -in $ref -dbtype nucl
blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28
S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn
cat $ref.its.blastn|sort -k14,14nr|
 awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n|
 awk -F'\t' -v maxgap=$maxgap 'BEGIN{
  OFS="\t";
 }
 {
  if($11<$12){o="+"}else{o="-"};
  if(oldqname==$1&&oldsname==$2&&oldo==o){
   if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1};
   str=deltaq"\t"deltas
  }
  if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){
   if(oldsname!=""){
    if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore}
    else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore};
   }
   str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0;
  };
  if(jointq1==0&&jointq2==0){
   if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる
  };
  if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11};
  if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){
   if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1}
   else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1};
   print "joint::"str2
  };
  oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4
  jointscore+=$14
  $13=o"\t"str"\t"$13;
  print $0;
 } END{
    if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore}
    else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore};
 }' > $ref.its.blastn.tab
cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result

seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x,  i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b
=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{prin
t revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta

mv $ref.its.blastn.tab.result.fasta $ref.its.blastn "$resdir"/
rm -rf /tmp/$base

taxonomy pathを取得

more eukaryotes.csv |tail -n+2|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|cut -f 1,6 -d,|sed 's/^"//; s/"$//; s/","/\t/'|sed "s/'//g"|awk -F'\t' 'FILENAME==ARGV[1]{a[$3]=$1} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3]{print $2"\t"$1"\t"a[$1]"\t"path[a[$1]]}' <(cat /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname|sed "s/'//g") /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > eukaryotes.csv.path

新規に追加できた種数を確認(1種につき、ゲノムは複数存在するため)

for i in ssu-lsu/*.fasta; do if [ -s "$i" ]; then basename $i|sed 's/[.].*//'; fi; done|awk -F'\t' 'FILENAME==ARGV[1]{OFS="\t"; split($1,arr,"."); a[arr[1]]=$4} FILENAME==ARGV[2]{print $1"\t"a[$1]}' eukaryotes.csv.path /dev/stdin |cut -f 2|sort|uniq|wc -l
find /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i ; do j=`basename $i _genomic.fna.gz|cut -f 1 -d.`; zcat $i|sed 's/^>/>'"$j"'__/'; done > fishes.all.genome.fa
/suikou/tool/ncbi-blast-2.13.0+/bin/makeblastdb -in fishes.all.genome.fa -dbtype nucl -parse_seqids -hash_index

/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db "fishes.all.genome.fa /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt" -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn

#下記のエラー
Error: NCBI C++ Exception:
    T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/corelib/ncbiobj.cpp", line 992: Critical: (CCoreException::eNullPtr) ncbi::CObject::ThrowNullPointerException() - Attempt to access NULL pointer.
    T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/app/blast/blast_app_util.cpp", line 773: Critical: (CCoreException::eNullPtr) BLAST::ncbi::BlastFormatter_PreFetchSequenceData() - Error pre-fetching sequence data
     Stack trace:
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CObject::ThrowNullPointerException() offset=0x9C addr=0x16d38fc
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x122c5fc] offset=0x0 addr=0x122c5fc
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::IsSetInst_Length() const offset=0x9 addr=0x122c919
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::GetBioseqLength() const offset=0xE addr=0x122ccce
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::blast::LoadSequencesToScope(std::vector<ncbi::objects::CSeq_id_Handle, std::allocator<ncbi::objects::CSeq_id_Handle> >&, std::vector<ncbi::CRange<unsigned int>, std::allocator<ncbi::CRange<unsigned int> > >&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>&) offset=0x639 addr=0xd26399
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::BlastFormatter_PreFetchSequenceData(ncbi::blast::CSearchResultSet const&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>, ncbi::blast::CFormattingArgs::EOutputFormat) offset=0x89F addr=0xa065bf
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::x_RunMTBySplitDB() offset=0x1B41 addr=0x9fbd91
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::Run() offset=0x34 addr=0x9fe7a4
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::x_TryMain(ncbi::EAppDiagStream, char const*, int*, bool*) offset=0x13B addr=0x16274fb
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::AppMain(int, char const* const*, char const* const*, ncbi::EAppDiagStream, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) offset=0x6FD addr=0x162af2d
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 main offset=0x69 addr=0x9c4079
      /lib64/libc.so.6 ???:0 __libc_start_main offset=0xF5 addr=0x7f4c31307b35
      /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x9f0db3] offset=0x0 addr=0x9f0db3

#一つずつblastをかけることに
for i in `seq -w 0 99` `seq 100 108`; do echo $i; time /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db fishes.all.genome.fa.$i  -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn.fish.$i; done
#taxonomy path追加
cat DRR086650.fasta.blastn.fish.*|awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); path[arr[1]]=$4} FILENAME==ARGV[2]{split($2,arr,"__"); print $0"\t"path[arr[1]]}' eukaryotes.csv.path /dev/stdin > DRR086650.fasta.blastn-fish-all

/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.nt
cat DRR086650.fasta.blastn.nt|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[4],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[4],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.nt <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.nt.path

/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.ssu
cat DRR086650.fasta.blastn.ssu|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[2],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[2],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.ssu <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.ssu.path

/suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/pr2/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 16 > DRR086650.fasta.blastn.pr2
cat DRR086650.fasta.blastn.pr2|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"."); a[arr[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"."); print $0"\t"taxid2path[acc2taxid[arr[1]]]}' DRR086650.fasta.blastn.pr2 <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.pr2.path

cat DRR086650.fasta.blastn-fish-all DRR086650.fasta.blastn.nt.path DRR086650.fasta.blastn.ssu.path DRR086650.fasta.blastn.pr2.path|sort -k1,1V -k14,14nr --parallel 40 -S 100G > DRR086650.fasta.all.blastn

#LCA
awk -F'\t' -v percent=0.99 '
#検索用の関数 関数内でローカル変数を使用するために、関数内で使用する全変数を引数の後ろ側につけておく。これはawkのいけてない仕様だそう。
function searchLCA(data,  i, j, res, res2, str, n, stopflag){
 #同じリードのデータを全て検索して共通していないパスを空白に置換しておく。
 for(i in data){
  if(n==0){n=split(i,res,";")}
  else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
 }
 if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
 for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
 return str;
}
{
 #新しいリードに変わったら、いろんな変数を初期化しておく
 if($1!=old){if($17!=""){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; basescore=$14; data[$17]=1; old=$1; oldstr=$0}}
 #前の行と同じリードで、閾値以上(bitscoreがトップヒットの99%以上)であればデータ配列に突っ込む
 else if($14>=basescore*percent && $17!=""){data[$17]=1}
}
END{print searchLCA(data)"\t"oldstr}' DRR086650.fasta.all.blastn > DRR086650.fasta.all.blastn.lca
wget https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta.gz

more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|sed 's/^>//'|awk -F. '{print $1"\t"$0}'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=$2} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3] && $1 in a{print a[$1]"\t"path[$3]}' /dev/stdin /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) > pr2_version_4.14.0_SSU_taxo_long.fasta.path
$ more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|cut -f 2,3 -d "|"|sort|uniq -c |sort -nr
 183237 18S_rRNA|nucleus
   7968 16S_rRNA|
   6350 16S_rRNA|plastid
     22 16S_rRNA|apicoplast
     16 16S_rRNA|mitochondrion
      7 16S_rRNA|chromatophore
      2 18S_rRNA|nucleomorph
  • blast_db検証.1655558458.txt.gz
  • 最終更新: 2022/06/18 13:20
  • by 118.240.79.152