**文書の過去の版を表示しています。**
accession→taxonomy pathを付与
awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"."); a[arr[1]]=$0} FILENAME==ARGV[2] && $1 in a{ac2tax[$1]=$3} FILENAME==ARGV[3]{tax2name[$1]=$2} END{for(i in a){print a[i]"\t"tax2name[ac2tax[i]]}}' <(seqkit fx2tab SSU_eukaryote_rRNA.fasta) <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) /suikou/db/ncbi/2021-06-01_taxdump/names.dmp.sname.path > SSU_eukaryote_rRNA.fasta.path
mafftで魚類だけアライメント
軟骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes 硬骨魚類:root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii more SSU_eukaryote_rRNA.fasta.path|egrep "(root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Chondrichthyes|root;cellular organisms;Eukaryota;Opisthokonta;Metazoa;Eumetazoa;Bilateria;Deuterostomia;Chordata;Craniata;Vertebrata;Gnathostomata;Teleostomi;Euteleostomi;Actinopterygii)"|awk -F'\t' '{OFS="\t"; gsub(" ","_",$1); print ">"$1; print $2}' > SSU_eukaryote_rRNA.fasta.path.fish.fa mafft.bat --thread 32 SSU_eukaryote_rRNA.fasta.path.fish.fa > SSU_eukaryote_rRNA.fasta.path.fish.fa.mafft
SSU ITS1 5.8S ITS2 LSUが完璧なもの(ヒトとか)のみ抽出
awk -F'\t' 'FILENAME==ARGV[1]{a[substr($0,2)]=1} FILENAME==ARGV[2]&&$1 in a{gsub(" ","_",$1); print ">"$1; print $2}' <(cat /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta |grep "SSU ITS1 5.8S ITS2 LSU"|grep -v ">$"|grep -v "<") <(seqkit fx2tab /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/ITS_eukaryote_sequences.fasta) > ITS_eukaryote_sequences.fasta.complete.fa
SSU-LSUをblast結果から抜き出す
/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/run.sh
#!/bin/bash N_CPU=8 maxgap=3000 mincov=0.8 minscore=3000 ref="$1" makeblastdb -in $ref -dbtype nucl blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn cat $ref.its.blastn|sort -k14,14nr| awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| awk -F'\t' -v maxgap=$maxgap 'BEGIN{ OFS="\t"; } { if($11<$12){o="+"}else{o="-"}; if(oldqname==$1&&oldsname==$2&&oldo==o){ if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; str=deltaq"\t"deltas } if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ if(oldsname!=""){ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; } str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; }; if(jointq1==0&&jointq2==0){ if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる }; if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; print "joint::"str2 }; oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 jointscore+=$14 $13=o"\t"str"\t"$13; print $0; } END{ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; }' > $ref.its.blastn.tab cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b=b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{print revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta
ゲノムFASTAファイル一括ダウンロードしてssu~lsuを抽出
https://www.ncbi.nlm.nih.gov/genome/browse#!/eukaryotes/fishes
NCBI Genome DBでfishesと検索して出てきた結果を「Download」ボタンを押してダウンロード。「eukaryotes.csv」がダウンロードされる。
more eukaryotes.csv|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|awk -F, '{if($15!=""){print $15}else if($16!=""){print $16}else{print NR,"error"}}'|tail -n+2|grep -v error$|sed 's/"//g; s/$/\//'|awk '{print "wget -t 0 -c -rl1 --accept=.fna.gz "$0}'|while read i; do $i; done #途中で止まっているファイルが2%くらいあるので、もう一度実行して途中で止まったダウンロードをレジュームしておく。 #それでも変になっているファイルがあるので、zcatするとinvalid compressed data--format violatedと出てくるエラーメッセージを集めて下記のコマンドで再度ダウンロード more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|awk '{a=$0; sub("/suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/","",a); print "rm -f "$0"; wget http://"a}'|xargs -I{} bash -c "{}" more error.invalid.list |awk '{print $2}'|sed 's/:$//'|sed '/^$/d'|sort|uniq|awk -F/ '{print "mv "$NF" "$0}'|xargs -I{} bash -c "{}"
ssu~lsuを抽出
find ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i; do qsub -j y ~/qsubsh8 ./run-for-gz.sh $i; done
$ more run-for-gz.sh #!/bin/bash N_CPU=8 maxgap=3000 mincov=0.8 minscore=3000 resdir="$PWD/ssu-lsu" mkdir -p "$resdir" base=`basename $1 .gz` mkdir /tmp/$base zcat "$1" > /tmp/$base/$base #ref="$1" ref=/tmp/$base/$base makeblastdb -in $ref -dbtype nucl blastn -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -db $ref -query /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28 S/ITS_eukaryote_sequences.fasta.complete.fa -max_target_seqs 10000 -num_threads $N_CPU > $ref.its.blastn cat $ref.its.blastn|sort -k14,14nr| awk 'NR==1{name=$1} $1==name{print $0}'|sort -k2,2 -k11,11n -k12,12n| awk -F'\t' -v maxgap=$maxgap 'BEGIN{ OFS="\t"; } { if($11<$12){o="+"}else{o="-"}; if(oldqname==$1&&oldsname==$2&&oldo==o){ if(o=="+"){deltaq=$9-oldq2; deltas=$11-olds2}else{deltaq=oldq1-$10; deltas=$12-olds1}; str=deltaq"\t"deltas } if(!(oldqname==$1&&oldsname==$2&&oldo==o && deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap)){ if(oldsname!=""){ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; } str="na\tna"; deltaq=0; deltas=0; jointq1=0; joints1=0; jointq2=0; jointscore=0; }; if(jointq1==0&&jointq2==0){ if(o=="+"){jointq1=$9; joints1=$11}else{jointq2=$10; joints1=$12} #-の時はqは終わりから始まる }; if(o=="+"){tempq2=$10; temps2=$12}else{tempq1=$9; temps2=$11}; if(deltaq>0&&deltaq<=maxgap&&deltas>0&&deltas<=maxgap){ if(o=="+"){str2=o"\t"jointq1","$10","$10-jointq1","joints1","$12","$12-joints1} else{str2=o","$9","jointq2","jointq2-$9","joints1","$11","$11-joints1}; print "joint::"str2 }; oldq1=$9; oldq2=$10; olds1=$11; olds2=$12; oldqname=$1; oldsname=$2; oldo=o; oldqlen=$3; oldslen=$4 jointscore+=$14 $13=o"\t"str"\t"$13; print $0; } END{ if(oldo=="+"){print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"jointq1"\t"tempq2"\t"joints1"\t"temps2"\t"tempq2-jointq1+1"\t"temps2-joints1+1"\t"jointscore} else{print "result:\t"oldqname"\t"oldsname"\t"oldo"\t"oldqlen"\t"oldslen"\t"tempq1"\t"jointq2"\t"joints1"\t"temps2"\t"jointq2-tempq1+1"\t"temps2-joints1+1"\t"jointscore}; }' > $ref.its.blastn.tab cat $ref.its.blastn.tab|grep "^result:"|cut -f 2-|sort -k12nr|awk -F'\t' -v mincov=$mincov -v minscore=$minscore '$10>=$4*mincov&&$12>=minscore' |tee $ref.its.blastn.tab.result seqkit fx2tab $ref |awk -F'\t' 'function revcomp(x, i, b){b=""; for(i=length(x);i>0;i--){switch(toupper(substr(x,i,1))){case "A":b=b"T";break; case "C":b=b"G";break; case "G":b=b"C";break; case "T":b =b"A";break; default:b=b"N";break}}; return b} FILENAME==ARGV[1]{split($1,arr," "); seq[arr[1]]=$2} FILENAME==ARGV[2]{print ">"$2":"$8"-"$9":"$3; if($3=="+"){print substr(seq[$2],$8,$9-$8+1)}else{prin t revcomp(substr(seq[$2],$8,$9-$8+1))}}' /dev/stdin $ref.its.blastn.tab.result > $ref.its.blastn.tab.result.fasta mv $ref.its.blastn.tab.result.fasta $ref.its.blastn "$resdir"/ rm -rf /tmp/$base
taxonomy pathを取得
more eukaryotes.csv |tail -n+2|awk '{flag=-1; str=""; for(i=1;i<=length($0);i++){t=substr($0,i,1); if(t=="\""){flag=-flag; str=str""t}else if(flag==1&&t==","){str=str"_"}else{str=str""t}}; print str}'|cut -f 1,6 -d,|sed 's/^"//; s/"$//; s/","/\t/'|sed "s/'//g"|awk -F'\t' 'FILENAME==ARGV[1]{a[$3]=$1} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3]{print $2"\t"$1"\t"a[$1]"\t"path[a[$1]]}' <(cat /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname|sed "s/'//g") /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path /dev/stdin > eukaryotes.csv.path
新規に追加できた種数を確認(1種につき、ゲノムは複数存在するため)
for i in ssu-lsu/*.fasta; do if [ -s "$i" ]; then basename $i|sed 's/[.].*//'; fi; done|awk -F'\t' 'FILENAME==ARGV[1]{OFS="\t"; split($1,arr,"."); a[arr[1]]=$4} FILENAME==ARGV[2]{print $1"\t"a[$1]}' eukaryotes.csv.path /dev/stdin |cut -f 2|sort|uniq|wc -l
Fishes全ゲノムデータベース
find /suikou/files/m1536/yoshitake.kazutoshi/work/fish-18S28S/ftp.ncbi.nlm.nih.gov/|grep _genomic.fna.gz|grep -v _from_genomic.fna.gz|while read i ; do j=`basename $i _genomic.fna.gz|cut -f 1 -d.`; zcat $i|sed 's/^>/>'"$j"'__/'; done > fishes.all.genome.fa /suikou/tool/ncbi-blast-2.13.0+/bin/makeblastdb -in fishes.all.genome.fa -dbtype nucl -parse_seqids -hash_index /suikou/tool/ncbi-blast-2.13.0+/bin/blastn -db "fishes.all.genome.fa /suikou/db/ncbi/2021-12-01_SSU_LSU_ITS_mito_plastid/SSU_LSU_ITS_mito_plastid.maskadapters.desc.fa /suikou/db/ncbi/2021-05-24_nt_v5_formal/nt" -query DRR086650.fasta -outfmt "6 qseqid sseqid qlen slen pident length mismatch gapopen qstart qend sstart send evalue bitscore staxids stitle" -num_threads 40 > DRR086650.fasta.blastn
PR2データベース検証
wget https://github.com/pr2database/pr2database/releases/download/v4.14.0/pr2_version_4.14.0_SSU_taxo_long.fasta.gz more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|sed 's/^>//'|awk -F. '{print $1"\t"$0}'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=$2} FILENAME==ARGV[2]{path[$1]=$2} FILENAME==ARGV[3] && $1 in a{print a[$1]"\t"path[$3]}' /dev/stdin /suikou/db/ncbi/2022-06-15_taxdump/names.dmp.sname.path <(zcat /suikou/db/ncbi/2022-04-10_accession2taxid/nucl_gb.accession2taxid.gz) > pr2_version_4.14.0_SSU_taxo_long.fasta.path
$ more pr2_version_4.14.0_SSU_taxo_long.fasta|grep ">"|cut -f 2,3 -d "|"|sort|uniq -c |sort -nr 183237 18S_rRNA|nucleus 7968 16S_rRNA| 6350 16S_rRNA|plastid 22 16S_rRNA|apicoplast 16 16S_rRNA|mitochondrion 7 16S_rRNA|chromatophore 2 18S_rRNA|nucleomorph