**文書の過去の版を表示しています。**
accession→taxonomy pathを付与
awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); a[arr[1]]=$0} FILENAME==ARGV[2] && $1 in a{ac2tax[$1]=$3} FILENAME==ARGV[3]{tax2name[$1]=$2} END{for(i in a){print a[i]"\t"tax2name[ac2tax[i]]}}' <(seqkit fx2tab SSU_eukaryote_rRNA.fasta) <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2021-06-01_taxdump/names.dmp.sname.path > SSU_eukaryote_rRNA.fasta.path
mafftで魚類だけアライメント
軟骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes 硬骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii more SSU_eukaryote_rRNA.fasta.path|egrep "(root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes|root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii)"|awk -F'\t' '{OFS="\t"; gsub(" ","_",$1); print ">"$1; print $2}' > SSU_eukaryote_rRNA.fasta.path.fish.fa mafft.bat --thread 32 SSU_eukaryote_rRNA.fasta.path.fish.fa > SSU_eukaryote_rRNA.fasta.path.fish.fa.mafft
SSU ITS1 5.8S ITS2 LSUが完璧なもの(ヒトとか)のみ抽出
awk -F'\t' 'FILENAME==ARGV[1]{a[substr($0,2)]=1} FILENAME==ARGV[2]&&$1 in a{gsub(" ","_",$1); print ">"$1; print $2}' <(cat /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta |grep "SSU ITS1 5.8S ITS2 LSU"|grep -v ">$"|grep -v "<") <(seqkit fx2tab /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta) > ITS_eukaryote_sequences.fasta.complete.fa
SSU-LSUをblast結果から抜き出す
/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/run.sh
#!/bin/bash N_CPU=8 maxgap=3000 mincov=0.8 minscore=3000 ref="$1" makeblastdb -in $ref -dbtype nucl blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn cat $ref.its.blastn|sort -k14,14nr| awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| awk -F'\t' -v maxgap=$maxgap 'BEGIN{ OFS="\t"; } { if($11<$12){o="+"}else{o="-"}; if(oldqname==$1&&oldsname==$2&&oldo==o){ if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; str=deltaq"\t"deltas } if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ if(oldsname!=""){ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; } str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; }; if(jointq1==0&&jointq2==0){ if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる }; if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; print "joint::"str2 }; oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 jointscore+=$14 $13=o"\t"str"\t"$13; print $0; } END{ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; }' > $ref.its.blastn.tab cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{print revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta
ゲノムFASTAファイル一括ダウンロードしてssu~lsuを抽出
https://www.ncbi.nlm.nih.gov/genome/browse#!/eukaryotes/fishes
NCBI Genome DBでfishesと検索して出てきた結果を「Download」ボタンを押してダウンロード。「eukaryotes.csv」がダウンロードされる。
more eukaryotes.csv|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|awk -F, '{if($15!=""){print $15}else if($16!=""){print $16}else{print NR,"error"}}'|tail -n+2|grep -v error$|sed 's/"//g; s/$/\//'|awk '{print "wget -t 0 -c -rl1 --accept=.fna.gz "$0}'|while read i; do $i; done #途中で止まっているファイルが2%くらいあるので、もう一度実行して途中で止まったダウンロードをレジュームしておく。 #それでも変になっているファイルがあるので、zcatするとinvalid compressed data--format violatedと出てくるエラーメッセージを集めて下記のコマンドで再度ダウンロード more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|awk '{a=$0; sub("/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/","",a); print "rm -f "$0"; wget http://"a}'|xargs -I{} bash -c "{}" more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|sort|uniq|awk -F/ '{print "mv "$NF" "$0}'|xargs -I{} bash -c "{}"
ssu~lsuを抽出
find ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i; do qsub -j y ~/qsubsh8 ./run-for-gz.sh $i; done
$ more run-for-gz.sh #!/bin/bash N_CPU=8 maxgap=3000 mincov=0.8 minscore=3000 resdir="$PWD/ssu-lsu" mkdir -p "$resdir" base=`basename $1 .gz` mkdir /tmp/$base zcat "$1" > /tmp/$base/$base #ref="$1" ref=/tmp/$base/$base makeblastdb -in $ref -dbtype nucl blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28 S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn cat $ref.its.blastn|sort -k14,14nr| awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| awk -F'\t' -v maxgap=$maxgap 'BEGIN{ OFS="\t"; } { if($11<$12){o="+"}else{o="-"}; if(oldqname==$1&&oldsname==$2&&oldo==o){ if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; str=deltaq"\t"deltas } if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ if(oldsname!=""){ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; } str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; }; if(jointq1==0&&jointq2==0){ if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる }; if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; print "joint::"str2 }; oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 jointscore+=$14 $13=o"\t"str"\t"$13; print $0; } END{ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; }' > $ref.its.blastn.tab cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b =b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{prin t revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta mv $ref.its.blastn.tab.result.fasta $ref.its.blastn "$resdir"/ rm -rf /tmp/$base
taxonomy pathを取得
more eukaryotes.csv |tail -n+2|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|cut -f 1,6 -d,|sed 's/^"//; s/"$//; s/","/\t/'|sed "s/'//g"|awk -F'\t' 'FILENAME==ARGV[1]{a[$3]=$1} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3]{print $2"\t"$1"\t"a[$1]"\t"path[a[$1]]}' <(cat /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname|sed "s/'//g") /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > eukaryotes.csv.path
新規に追加できた種数を確認(1種につき、ゲノムは複数存在するため)
for i in ssu-lsu/*.fasta; do if [ -s "$i" ]; then basename $i|sed 's/[.].*//'; fi; done|awk -F'\t' 'FILENAME==ARGV[1]{OFS="\t"; split($1,arr,"."); a[arr[1]]=$4} FILENAME==ARGV[2]{print $1"\t"a[$1]}' eukaryotes.csv.path /dev/stdin |cut -f 2|sort|uniq|wc -l
Fishes全ゲノムデータベース
find /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i ; do j=`basename $i _genomic.fna.gz|cut -f 1 -d.`; zcat $i|sed 's/^>/>'"$j"'__/'; done > fishes.all.genome.fa /suikou/tool/ncbi-blast-2.13.0+/bin/makeblastdb -in fishes.all.genome.fa -dbtype nucl -parse_seqids -hash_index /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db "fishes.all.genome.fa /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt" -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn #下記のエラー Error: NCBI C++ Exception: T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/corelib/ncbiobj.cpp", line 992: Critical: (CCoreException::eNullPtr) ncbi::CObject::ThrowNullPointerException() - Attempt to access NULL pointer. T0 "/home/coremake/release_build/build/PrepareRelease_Linux64-Centos_JSID_01_660219_130.14.18.128_9008__PrepareRelease_Linux64-Centos_1643834072/c++/compilers/unix/../../src/app/blast/blast_app_util.cpp", line 773: Critical: (CCoreException::eNullPtr) BLAST::ncbi::BlastFormatter_PreFetchSequenceData() - Error pre-fetching sequence data Stack trace: /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CObject::ThrowNullPointerException() offset=0x9C addr=0x16d38fc /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x122c5fc] offset=0x0 addr=0x122c5fc /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::IsSetInst_Length() const offset=0x9 addr=0x122c919 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::objects::CBioseq_Handle::GetBioseqLength() const offset=0xE addr=0x122ccce /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::blast::LoadSequencesToScope(std::vector<ncbi::objects::CSeq_id_Handle, std::allocator<ncbi::objects::CSeq_id_Handle> >&, std::vector<ncbi::CRange<unsigned int>, std::allocator<ncbi::CRange<unsigned int> > >&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>&) offset=0x639 addr=0xd26399 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::BlastFormatter_PreFetchSequenceData(ncbi::blast::CSearchResultSet const&, ncbi::CRef<ncbi::objects::CScope, ncbi::CObjectCounterLocker>, ncbi::blast::CFormattingArgs::EOutputFormat) offset=0x89F addr=0xa065bf /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::x_RunMTBySplitDB() offset=0x1B41 addr=0x9fbd91 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 CBlastnApp::Run() offset=0x34 addr=0x9fe7a4 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::x_TryMain(ncbi::EAppDiagStream, char const*, int*, bool*) offset=0x13B addr=0x16274fb /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 ncbi::CNcbiApplicationAPI::AppMain(int, char const* const*, char const* const*, ncbi::EAppDiagStream, char const*, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&) offset=0x6FD addr=0x162af2d /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 main offset=0x69 addr=0x9c4079 /lib64/libc.so.6 ???:0 __libc_start_main offset=0xF5 addr=0x7f4c31307b35 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn ???:0 /suikou/tool/ncbi-blast-2.13.0+/bin/blastn() [0x9f0db3] offset=0x0 addr=0x9f0db3 #一つずつblastをかけることに for i in `seq -w 0 99` `seq 100 108`; do echo $i; time /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db fishes.all.genome.fa.$i -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn.fish.$i; done #taxonomy path追加 cat DRR086650.fasta.blastn.fish.*|awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); path[arr[1]]=$4} FILENAME==ARGV[2]{split($2,arr,"__"); print $0"\t"path[arr[1]]}' eukaryotes.csv.path /dev/stdin > DRR086650.fasta.blastn-fish-all /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.nt cat DRR086650.fasta.blastn.nt|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[4],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[4],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.nt <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.nt.path /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 32 > DRR086650.fasta.blastn.ssu cat DRR086650.fasta.blastn.ssu|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"|"); split(arr[2],arr2,"."); a[arr2[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"|"); split(arr[2],arr2,"."); print $0"\t"taxid2path[acc2taxid[arr2[1]]]}' DRR086650.fasta.blastn.ssu <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.ssu.path /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db /suikou/db/pr2/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 16 > DRR086650.fasta.blastn.pr2 cat DRR086650.fasta.blastn.pr2|awk -F'\t' 'FILENAME==ARGV[1]{split($2,arr,"."); a[arr[1]]=1} FILENAME==ARGV[2]&& $1 in a{acc2taxid[$1]=$3} FILENAME==ARGV[3]{taxid2path[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"."); print $0"\t"taxid2path[acc2taxid[arr[1]]]}' DRR086650.fasta.blastn.pr2 <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > DRR086650.fasta.blastn.pr2.path cat DRR086650.fasta.blastn-fish-all DRR086650.fasta.blastn.nt.path DRR086650.fasta.blastn.ssu.path DRR086650.fasta.blastn.pr2.path|sort -k1,1V -k14,14nr --parallel 40 -S 10G > DRR086650.fasta.all.blastn
PR2データベース検証
wget https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta.gz more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|sed 's/^>//'|awk -F. '{print $1"\t"$0}'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=$2} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3] && $1 in a{print a[$1]"\t"path[$3]}' /dev/stdin /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) > pr2_version_4.14.0_SSU_taxo_long.fasta.path
$ more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|cut -f 2,3 -d "|"|sort|uniq -c |sort -nr 183237 18S_rRNA|nucleus 7968 16S_rRNA| 6350 16S_rRNA|plastid 22 16S_rRNA|apicoplast 16 16S_rRNA|mitochondrion 7 16S_rRNA|chromatophore 2 18S_rRNA|nucleomorph