After the public data analysis finishes, 2024 v2
#on m32s
mkdir /home/yoshitake.kazutoshi/work3/metasearch/2023/db5-2-merge
cd /home/yoshitake.kazutoshi/work3/metasearch/2023/db5-2-merge
for i in `ls -d ../db5/[1-9]*|grep -v "[.]tar"`;do echo $i; j=`basename $i`; mkdir $j; cp $i/*.result $j; done
for i in `ls -d ../db5-2/[1-9]*|grep -v "[.]tar"`;do echo $i; j=`basename $i`; cp $i/*.result $j; done
mkdir merge
for j in [1-9]*; do echo $j; if [ `find $j|grep result$|wc -l` != 0 ];then awk -F'\t' 'FNR==1{split(FILENAME,arr,"/"); split(arr[2],arr2,"."); print "id\t"arr2[1]} {print $0}' `find $j|grep result$` > merge/$j.db.tsv; fi; done
#remove zero-size files
find merge/ -size 0|xargs rm
#merge with the previous results
mkdir merge2
cd merge2
ln -s ~/work3/metasearch/2023/db4-2-merge/merge2/*.tsv .
find ~/work3/metasearch/2023/db5-2-merge/merge/|grep tsv$|while read i; do ln -s $i db5-`basename $i`; done
#cd ..; ./calccor test.tsv merge2/
scp db5-*.tsv m128m:/suikou/download9-v251/metasearch/metasearch_dev/metasearch/data/db_merge/
#on m128m
cd /suikou/download9-v251/metasearch/metasearch_dev/metasearch/data/db
for i in `ls ../db_merge/db5-*.tsv`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".input"} {print $0 > filename}' $i ; done
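A quick sanity check (a sketch, not part of the original pipeline) that each merged tsv yielded one .input file per id record:

for t in ../db_merge/db5-*.tsv; do
  j=`basename $t .tsv`
  ids=`awk -F'\t' '$1=="id"' $t | wc -l`    # samples in the merged tsv
  files=`find $j -name '*.input' | wc -l`   # .input files actually written
  echo "$t: $ids ids, $files files"
done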
For data showing in which samples each species is abundant #probably not used
for i in db*.db; do echo bash run-sp.sh $i; done | xargs -I{} -P16 bash -c "{}"
#run-sp.sh
i=$1 #1
echo $i
find $i|grep input$|while read i; do awk -F'\t' 'NR!=1&&$0!~"^unknown"{cnt+=$2; data[$1]=$2} END{for(i in data){n=split(i, arr, ";"); path=arr[1]; data2[path]+=data[i]; for(j=2;j<=n;j++){path=path";"arr[j]; data2[path]+=data[i]}}; for(i in data2){print FILENAME"\t"i"\t"data2[i]/cnt*100}}' $i; done|awk -F'\t' '{data[$2][$3][$1]=1} END{for(i in data){PROCINFO["sorted_in"]="@ind_num_desc"; for(j in data[i]){for(k in data[i][j]){print i"\t"j"\t"k}}}}' > species.$i.txt
cat species.*.txt|sort --parallel=16 --buffer-size=100G -t$'\t' -k1,1 -k2,2gr > species.all
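Since species.all is sorted on column 1, one taxon's samples can be pulled out directly (a sketch; the taxonomy string is just an example):

sp='Bacteria;Firmicutes;Bacilli'
awk -F'\t' -v sp="$sp" '$1==sp' species.all | head   # top samples, already sorted by percent desc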
Create the DB for searching
cd /suikou/download9-v251/metasearch/metasearch_dev/metasearch/data
cat db_merge/db*.tsv|awk -F'\t' '{if($1=="id"){if(name!=""){delete res; for(i in data){c=data[i]/cnt*100; split(i,arr,";"); str=arr[1]; res[str]+=c; for(j=2;j<=length(arr);j++){str=str";"arr[j]; res[str]+=c}}; for(i in res){print i"\t"res[i]"\t"name}}; delete data; name=$2; cnt=0}else{data[$1]=$2; cnt+=$2}} END{delete res; for(i in data){c=data[i]/cnt*100; split(i,arr,";"); str=arr[1]; res[str]+=c; for(j=2;j<=length(arr);j++){str=str";"arr[j]; res[str]+=c}}; for(i in res){print i"\t"res[i]"\t"name}}' > db.tsv
sqlite3 /metasearch_data/species2.db
CREATE TABLE IF NOT EXISTS data (sp_name TEXT, percent REAL, srr_name TEXT);
.mode tab
.import db.tsv data
CREATE INDEX index_sp_name ON data(sp_name);
.quit
ln -sf /metasearch_data/species2.db species.db
a="Eukaryota;Obazoa;Opisthokonta;Fungi;Basidiomycota;Agaricomycotina;Agaricomycetes;Lentinus;Lentinus_crinitus"
echo -e '.mode tabs\nselect * from data where sp_name = "'"$a"'" order by percent desc' |sqlite3 species.db
#unless the DB file sits on an SSD, queries take a very long time to return
Create the lookup table for searching SRR files
find db|grep input$ |awk -F/ '{a=$NF; sub(/.input$/,"",a); print a"\t"$0}' > input.list
sort -k1,1 -t$'\t' input.list > input.list.sort
#for look; if the tab delimiter is not specified, the sort includes the tab character itself and some IDs then fail to match in look
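A usage sketch (SRR ID taken from the test step further down); appending the tab keeps look from matching mere ID prefixes:

srr=SRR19572742
look "$srr$(printf '\t')" input.list.sort   # binary search; prints "ID<TAB>path" lines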
For retrieving sample information
#on m32s
cd work3/metasearch/2023/sra-20240219
cat sralist.2024-03-13.inddbj-metagenom |awk -F'\t' '{split($11,arr,"###"); split($12,arr2,"###"); split($13,arr3,"###"); print $8"\t"$9"\t"arr[3]"\t"arr2[1]"\t"arr3[1]"\t"$3"\t"arr2[6]}' > sralist.2024-03-13.inddbj-metagenom.info
#as user yoshitake.kazutoshi on m128m
cd /suikou/download9-v251/metasearch/metasearch_dev/metasearch/data
#sra_info.txt.2024-02-20 -> /suikou/download9-v251/metasearch/data/sra_result_2024_01-19.csv.tsv3
awk -F'\t' 'FILENAME==ARGV[1]{print $0; a[$2]=1} FILENAME==ARGV[2]&&!($2 in a){print $0"\t\t"}' ~/work3/metasearch/2023/sra-20240219/sralist.2024-03-13.inddbj-metagenom.info sra_info.txt.2024-02-20 > sra_info.txt
sed 's/"/\\"/g' sra_info.txt > sra_info_forSQL.txt
sqlite3 /metasearch_data/species2.db
CREATE TABLE IF NOT EXISTS srainfo (srx_id TEXT, srr_id TEXT, srx_name TEXT, srs_org TEXT, srp_name TEXT, srr_reads TEXT, srs_geo TEXT);
.mode tab
.import sra_info_forSQL.txt srainfo
CREATE INDEX index_srr_id ON srainfo(srr_id);
.quit
After the public data analysis finishes, 2024
#cd /home/yoshitake.kazutoshi/work3/metasearch/2023/db4-2-merge
for i in `ls -d ../db4/[1-9]*|grep -v "[.]tar"`;do echo $i; j=`basename $i`; mkdir $j; cp $i/*.result $j; done
for i in `ls -d ../db4-2/[1-9]*|grep -v "[.]tar"`;do echo $i; j=`basename $i`; cp $i/*.result $j; done
mkdir merge
for j in [1-9]*; do echo $j; if [ `find $j|grep result$|wc -l` != 0 ];then awk -F'\t' 'FNR==1{split(FILENAME,arr,"/"); split(arr[2],arr2,"."); print "id\t"arr2[1]} {print $0}' `find $j|grep result$` > merge/$j.db.tsv; fi; done
#remove zero-size files
find merge/ -size 0|xargs rm
#merge with the previous results
mkdir merge2
find ~/work3/metasearch/2023/db3-3-merge/merge/|grep tsv$|while read i; do ln -s $i merge2/db3-`basename $i`; done
find ~/work3/metasearch/2023/db4-2-merge/merge/|grep tsv$|while read i; do ln -s $i merge2/db4-`basename $i`; done
~/oldhome/work2/calccor/calccor test1 merge2/
To split the merged results again (for the metasearch DB)
#on m128m, in /suikou/download9-v251/metasearch/metasearch_dev/metasearch/data/db
for i in `ls ../db_merge/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".input"} {print $0 > filename}' ../db_merge/$i ; done
Create analysis results for testing
cd /home/yoshitake.kazutoshi/work3/metasearch/2023/db3-3-merge/test
srr=SRR19572742
#bash run-download.sh $srr
bash ~/work3/metasearch/2023/db4-2/run-blast3-4.sh $srr
(echo -e "id\t$srr"; cat $srr.result) > $srr.input
~/oldhome/work2/calccor/calccor $srr.input ~/work3/metasearch/2023/db4-2-merge/merge2/
For data showing in which samples each species is abundant
for i in [1-9] [1-9][0-9] [1-9][0-9][0-9] [1-9][0-9][0-9][0-9]; do echo bash run-sp.sh $i; done | xargs -I{} -P16 bash -c "{}"
#run-sp.sh
i=$1 #1
echo $i
find $i|grep result$|while read i; do awk -F'\t' '$0!~"^unknown"{cnt+=$2; data[$1]=$2} END{for(i in data){n=split(i, arr, ";"); path=arr[1]; data2[path]+=data[i]; for(j=2;j<=n;j++){path=path";"arr[j]; data2[path]+=data[i]}}; for(i in data2){print FILENAME"\t"i"\t"data2[i]/cnt*100}}' $i; done|awk -F'\t' '{data[$2][$3][$1]=1} END{for(i in data){PROCINFO["sorted_in"]="@ind_num_desc"; for(j in data[i]){for(k in data[i][j]){print i"\t"j"\t"k}}}}' > species.$i.txt
cat species.*.txt|sort --parallel=16 --buffer-size=100G -t$'\t' -k1,1 -k2,2gr > species.all
Create the DB for searching
cat db_merge/db*.tsv|awk -F'\t' '{if($1=="id"){if(name!=""){delete res; for(i in data){c=data[i]/cnt*100; split(i,arr,";"); str=arr[1]; res[str]+=c; for(j=2;j<=length(arr);j++){str=str";"arr[j]; res[str]+=c}}; for(i in res){print i"\t"res[i]"\t"name}}; delete data; name=$2; cnt=0}else{data[$1]=$2; cnt+=$2}} END{delete res; for(i in data){c=data[i]/cnt*100; split(i,arr,";"); str=arr[1]; res[str]+=c; for(j=2;j<=length(arr);j++){str=str";"arr[j]; res[str]+=c}}; for(i in res){print i"\t"res[i]"\t"name}}' > db.tsv
sqlite3 species.db
CREATE TABLE IF NOT EXISTS data (sp_name TEXT, percent REAL, srr_name TEXT);
.mode tab
.import db.tsv data
CREATE INDEX index_sp_name ON data(sp_name);
.quit
a="Eukaryota;Obazoa;Opisthokonta;Fungi;Basidiomycota;Agaricomycotina;Agaricomycetes;Lentinus;Lentinus_crinitus"
echo -e '.mode tabs\nselect * from data where sp_name = "'"$a"'" order by percent desc' |sqlite3 species.db
#unless the DB file sits on an SSD, queries take a very long time to return
(echo "CREATE TABLE IF NOT EXISTS data (sp_name TEXT, percent REAL, srr_name TEXT);" echo ".mode tab"; for i in species.*.txt; do echo ".import $i data"; done echo "CREATE INDEX index_sp_name ON data(sp_name);") | sqlite3 species.db echo -e 'select count(distinct srr_name) from data' |sqlite3 species.db > species.srr.count
echo -e '.mode tabs\nselect * from data where sp_name = "Bacteria;Firmicutes;Bacilli;Acholeplasmatales;Acholeplasmataceae" order by percent desc' |sqlite3 species.db |sort -t$'\t' -k2,2gr|wc -l
sp_name never contains a double quote, but it can contain a single quote; so wrap sp_name in double quotes when issuing the SQL and the search works:
echo -e '.mode tabs\nselect * from data where sp_name = "'"$a"'" order by percent desc' |sqlite3 species.db
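That quoting rule can be wrapped in a tiny helper (a sketch; query_sp is a hypothetical name):

query_sp() {  # $1 = taxonomy path; double quotes are safe because sp_name never contains "
  printf '.mode tabs\nselect * from data where sp_name = "%s" order by percent desc;\n' "$1" | sqlite3 species.db
}
query_sp "$a" | head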
Create the lookup table for searching SRR files
find db|grep input$ > input.list.tmp
more input.list.tmp |awk -F/ '{a=$NF; sub(/.input$/,"",a); print a"\t"$0}' > input.list
sort -k1,1 -t$'\t' input.list > input.list.sort
#for look; if the tab delimiter is not specified, the sort includes the tab character itself and some IDs then fail to match in look
Analysis with SILVA
Create the following file as run-silva.sh:
i=$1
work=$PWD
fastqdir=/suikou/files/r311/sra-fastq/SraId_aa.fastq
resultdir=/suikou/files/r311/sra-fastq-silva/SraId_aa
mkdir -p /tmp/$i/input
cd /tmp/$i
cp $fastqdir/$i input
/suikou/tool/yoshitake/pp/metagenome~silva_SSU+LSU -c 16 -m 32 -d 50 -t 0.99 input
gzip input/*.ssu.blast
cp input/*.ssu.blast.gz input/*.ssu.blast.filtered.name.lca.cnt2.input $resultdir
cd /tmp
rm -rf /tmp/$i
In /suikou/files/r311/sra-fastq-silva/SraId_aa, run the following command:
for i in `cat /suikou/files/r311/sra-fastq/SraId_aa`; do qsub -q all.q@m50* -j Y -N $i -pe def_slot 12 ~/qsubsh4 bash /suikou/files/r311/sra-fastq-silva/run-silva.sh $i.fastq.gz; done
DDBJ
cd ~/metadb
for i in `cat SraId_ac`; do qsub -j Y -N $i ~/qsubsh8 bash ./run-silva.sh $i.fastq.gz; done
r251
for i in `cat SraId_ad`; do qsub -j Y -N $i -pe def_slot 12 ~/qsubsh4 bash ./run-silva.sh $i.fastq.gz; done
With an array job
qsub run-job.sh SraId_ae
$ more run-job.sh
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 8
#$ -l mem_req=4.00G,s_vmem=4.00G
#$ -t 1:10000
source ~/.bashrc
echo "$*"
eval "$*"
n=$SGE_TASK_ID
echo $n
id=$1 #SraId_ab
i=`head -n $n $id|tail -n 1`
bash ./run-silva-range.sh $id $i.fastq.gz
$ more run-silva-range.sh
id=$1 #SraId_ab
i=$2 #ERR0000.fastq.gz
work=$PWD
fastqdir=$work/$id.fastq
resultdir=$work/sra-fastq-silva/$id
mkdir -p /tmp/$i/input
mkdir -p $resultdir
cd /tmp/$i
cp $fastqdir/$i input
~/pp/metagenome~silva_SSU+LSU -c 8 -m 32 -d 50 -t 0.99 input
gzip input/*.ssu.blast
cp input/*.ssu.blast.gz input/*.ssu.blast.filtered.name.lca.cnt2.input $resultdir
cd /tmp
rm -rf /tmp/$i
Data transfer (shiro, ddbj) @r311:/data/sra-fastq
serv=shiro
for i in a b c d e f g h i j k l m n o p q r s t u v w x y z; do id=a$i; ssh $serv mkdir metadb_$id; scp -r run-silva-range.sh run-job-$serv.sh SraId_${id}* $serv:metadb_$id; done
Data transfer (m50v251n3) @m768:/suikou/files/m512/backup/r311/sra-fastq
serv=m50v251n3
for i in a b c d e f g h i j k l m n o p q r s t u v w x y z; do id=a$i; ssh $serv mkdir /mnt/r251/metadb_$id; scp -r run-silva-range.sh run-job-$serv.sh SraId_${id}* $serv:/mnt/r251/metadb_$id; done
Data transfer (r311) @m512:/data/backup/r311
for i in a b c d e f g h i j k l m n o p q r s t u v w x y z; do id=b$i; mkdir metadb_$id; chmod 777 metadb_$id; cp -r sra-fastq/run-job-r311.sh sra-fastq/run-silva-range.sh sra-fastq/SraId_${id}* metadb_$id; done
Data transfer (t7500) @m768:/suikou/files/m512/backup/r311/sra-fastq
serv=t7500
for i in a; do id=c$i; ssh $serv mkdir /ddca2/metadb_$id; scp -r run-silva-range.sh run-job-$serv.sh SraId_${id}* $serv:/ddca2/metadb_$id; done
Array job submission (ddbj, shiro)
serv=ddbj
for i in a b c d e f g h i j k l m n o p q r s t u v w x y z; do id=b$i; cd metadb_$id; ls ; qsub -N $id run-job-$serv.sh SraId_$id; cd ..; done
Array job submission (r311, m50v251n3, t7500)
serv=r311
for i in a b c d e f g h i j k l m n o p q r s t u v w x y z; do id=b$i; cd metadb_$id; ls ; qsub -p -1000 -N $id run-job-$serv.sh SraId_$id; cd ..; done
Only on shiro do jobs get rescheduled on their own, so force them to run in order
qalter -hold_jid ao ap #run ap once ao has finished
for i in m n o p q r s t u v w x y z; do id=e$i; echo qalter -hold_jid $jd $id ; jd=$id; done
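The loop above only echoes the qalter commands; after checking the output (the first one is incomplete because $jd starts empty) they can be run directly. A sketch, assuming the chunk that should gate em is el:

jd=el   # assumption: the predecessor job for "em"; adjust to the real chunk name
for i in m n o p q r s t u v w x y z; do id=e$i; qalter -hold_jid $jd $id; jd=$id; done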
t7500
source /ddca/sge-8.1.8/default/common/settings.sh
serv=t7500
for i in b c d e f g h i j k l m n o p q r s t u v w x y z; do id=c$i; cd /ddca2/metadb_$id; ls ; sed -i 's/-c 8 /-c 4 /' run-silva-range.sh; qsub -p -1000 -N $id run-job-$serv.sh SraId_$id; cd ..; done
#to give other jobs priority:
sudo su -
source /ddca/sge-8.1.8/default/common/settings.sh
qalter -p 1000 <jobid>
Checking progress
for i in m256y m50v251n3 shiro ddbj t7500; do ssh $i qstat|egrep " (h|)qw"|head -n 1; done
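To get a per-server count of waiting tasks instead of just the first line (same ssh assumption as above):

for i in m256y m50v251n3 shiro ddbj t7500; do
  echo -n "$i: "
  ssh $i "qstat | egrep -c ' (h|)qw'"   # number of qw/hqw entries
done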
After the analysis finishes
On the server where the analysis ran
cd metadb
tar zvcf log.tar.gz *.o*
mv log.tar.gz sra-fastq-silva/SraId_[a-z][a-z]/
mv run*.sh sra-fastq-silva/SraId_[a-z][a-z]/
#in bulk
for i in metadb_ce metadb_cf metadb_cg metadb_ch metadb_ci; do echo $i; cd $i; tar zvcf log.tar.gz *.o*; mv log.tar.gz sra-fastq-silva/SraId_[a-z][a-z]/; mv run*.sh sra-fastq-silva/SraId_[a-z][a-z]/; cd ..; done
From m512
cd /data/backup/r311/sra-fastq-silva-result
rsync -av --progress shiro:metadb_ah/sra-fastq-silva/SraId_[a-z][a-z] .
For m50v251n3, @m512
cd /data/backup/r311/sra-fastq-silva-result
rsync -av --progress yoshitake@133.11.144.11:/data/metadb_dj/sra-fastq-silva/SraId_[a-z][a-z] .
Pulling from t7500, run on r311
cd /mnt/backup/r311/sra-fastq-silva-result
rsync -av --progress t7500:/ddca2/metadb_c[mn]/sra-fastq-silva/SraId_[a-z][a-z] .
For results on r311
id=bb
cd /data/backup/r311/metadb_$id
tar zvcf log.tar.gz *.o*
mv log.tar.gz sra-fastq-silva/SraId_[a-z][a-z]/
mv run*.sh sra-fastq-silva/SraId_[a-z][a-z]/
sudo mv -i sra-fastq-silva/SraId_[a-z][a-z] ../sra-fastq-silva-result/
cd ..
rm -rf metadb_$id
#in bulk
for id in ah ai dk dl dm do dp ; do echo $id; cd /data/backup/r311/metadb_$id; tar zvcf log.tar.gz *.o*; mv log.tar.gz sra-fastq-silva/SraId_[a-z][a-z]/; mv run*.sh sra-fastq-silva/SraId_[a-z][a-z]/; sudo mv -i sra-fastq-silva/SraId_[a-z][a-z] ../sra-fastq-silva-result/; cd ..; rm -rf metadb_$id; done
Transfer to the production server
/data/backup/r311/sra-fastq-silva-result@m512
for i in SraId_*; do echo $i; mkdir db/$i; cd db/$i; find ../../$i|grep .input$|xargs -I{} bash -c "ln -s {} ."; cd ../..; done
find db|grep .input$|xargs zip input.zip
for i in SraId_*; do if [ ! -e db/$i ]; then echo $i; mkdir db/$i; cd db/$i; find ../../$i|grep .input$|xargs -I{} bash -c "ln -s {} ."; cd ../..; fi; done
find db|grep .input$|xargs zip input.zip
Creating the merged DB files
for i in db/SraId_* ; do echo $i; cat `find $i` > db_merge/`basename $i`.tsv; done
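A follow-up check (a sketch) that no chunk was missed or produced an empty merge:

for i in db/SraId_*; do
  b=`basename $i`
  [ -s db_merge/$b.tsv ] || echo "missing or empty: db_merge/$b.tsv"
done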
Creating count files at the genus level
@/home/yoshitake/yoshitake/db_merge_genus
for i in `ls ../db_merge`; do echo $i; awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!="") {print i"\t"cnt[i]}}}' ~/yoshitake/SILVA_132_SSU-LSU_Ref.fasta.name.genus ~/yoshitake/db_merge/$i > ~/yoshitake/db_merge_genus/$i; done
#to process on the grid engine:
for i in `ls ~/yoshitake/db_merge`; do echo $i; qsub -pe def_slot 1 ~/qsubsh4 bash ~/yoshitake/run-db_merge_genus $i; done
Creating count files at the species level
@/home/yoshitake/yoshitake/db_merge_species
i=SraId_aa.tsv
awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!="") {print i"\t"cnt[i]}}}' ~/yoshitake/SILVA_132_SSU-LSU_Ref.fasta.name.species ~/yoshitake/db_merge/$i > ~/yoshitake/db_merge_species/$i
#to process on the grid engine:
for i in `ls ~/yoshitake/db_merge`; do echo $i; qsub -pe def_slot 1 ~/qsubsh4 bash ~/yoshitake/run-db_merge_species $i; done
Splitting the merged DB files apart
#all
for i in `ls ../db_merge/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_merge/$i ; done
#genus
for i in `ls ../db_merge_genus/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_merge_genus/$i ; done
#species
for i in `ls ../db_merge_species/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_merge_species/$i ; done
Changing the blast filter parameters
@~/work2/metagenome-db/chnage-blast-filter
#for i in /suikou/files/m512/backup/r311/sra-fastq/SraId_aa; do i0=`basename $i`; mkdir -p $i0; for j0 in `cat $i`; do echo "cd $i0; bash ../run-change-blast-filter.sh $i0 $j0"; done; done|xargs -I{} -P6 bash -c "{}"
#run with the result files already present on m512
for i in /suikou/files/m512/backup/r311/sra-fastq-silva-result/SraId_*; do i0=`basename $i`; mkdir -p $i0; for j0 in `cat /suikou/files/m512/backup/r311/sra-fastq/$i0`; do echo "cd $i0; bash ../run-change-blast-filter.sh2 $i0 $j0"; done; done|xargs -I{} -P6 bash -c "{}"
#i0=SraId_dn
#j0=SRR7870496
i0=$1
j0=$2
j=/suikou/files/r311/sra-fastq-silva-result/$i0/$j0.fastq.fasta
bitscore=100
top=0.95
ref=/suikou/db/silva/2018-10-10/SILVA_132_SSU-LSU_Ref.fasta
mkdir -p /tmp/$j0
workdir="$PWD"
cd /tmp/$j0
zcat $j.ssu.blast.gz|awk -F'\t' '$12>'$bitscore'{if(a[$1]==1){if($12>=topbit*'$top'){print $0}}else{a[$1]=1; topbit=$12; print $0}}'|awk 'FILENAME==ARGV[1]{if(FNR%4==1){name=substr($1,2)}else if(FNR%4==2){len[name]=length($0); l+=length($0); n++}} FILENAME==ARGV[2]{if(len[$1]==0){len[$1]=l/n}; if($3>90&&$4>len[$1]*0.8){print $0}}' <(zcat /suikou/files/m512/backup/r311/sra-fastq/$i0.fastq/$j0.fastq.gz) /dev/stdin > `basename $j`.ssu.blast.filtered
echo "##determine LCA"
awk -F'\t' 'FILENAME==ARGV[1]{name[$1]=$2} FILENAME==ARGV[2]{print name[$2]"\t"$0}' $ref.name `basename $j`.ssu.blast.filtered > `basename $j`.ssu.blast.filtered.name
awk -F'\t' '
function searchLCA(data, i, j, res, res2, str, n, stopflag){
  for(i in data){
    if(n==0){n=split(i,res,";")}
    else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
  }
  if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
  for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
  return str;
}
{
  if($2!=old){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; data[$1]=1; old=$2; oldstr=$0}
  else{data[$1]=1}
}
END{if(length(data)>0){print searchLCA(data)"\t"oldstr}}
' `basename $j`.ssu.blast.filtered.name > `basename $j`.ssu.blast.filtered.name.lca
awk -F'\t' '{cnt[$1]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' `basename $j`.ssu.blast.filtered.name.lca > `basename $j`.ssu.blast.filtered.name.lca.cnt
awk -F'\t' '{print "root;"$0}' `basename $j`.ssu.blast.filtered.name.lca.cnt > `basename $j`.ssu.blast.filtered.name.lca.cnt2
echo -e "id\t`basename $j.ssu.blast.filtered.name.lca.cnt2|sed 's/.fasta.ssu.blast.filtered.name.lca.cnt2$//'`" > `basename $j`.ssu.blast.filtered.name.lca.cnt2.input
cat `basename $j`.ssu.blast.filtered.name.lca.cnt2 >> `basename $j`.ssu.blast.filtered.name.lca.cnt2.input
rm $j0.fastq.fasta.ssu.blast.filtered $j0.fastq.fasta.ssu.blast.filtered.name $j0.fastq.fasta.ssu.blast.filtered.name.lca $j0.fastq.fasta.ssu.blast.filtered.name.lca.cnt $j0.fastq.fasta.ssu.blast.filtered.name.lca.cnt2
mv `basename $j`.ssu.blast.filtered.name.lca.cnt2.input $workdir
cd $workdir
rmdir /tmp/$j0
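The searchLCA function keeps the longest taxonomy prefix shared by all hits of a read. A standalone sketch of the same idea on two paths (hypothetical strings, not part of the pipeline):

printf 'Eukaryota;Fungi;Ascomycota\nEukaryota;Fungi;Basidiomycota\n' | awk -F';' '
  NR==1 { n=NF; for(i=1;i<=NF;i++) p[i]=$i; next }    # first path seeds the prefix
  { for(i=1;i<=n;i++) if(p[i]!=$i){ n=i-1; break } }   # shrink to the shared prefix
  END { s=(n>0)?p[1]:"unknown"; for(i=2;i<=n;i++) s=s";"p[i]; print s }'   # -> Eukaryota;Fungi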
Creating the DB files @m32s:~/work2/metagenome-db/chnage-blast-filter
mkdir db_merge
for i in SraId_* ; do echo $i; cat `find $i|grep input$` > db_merge/`basename $i`.tsv; done
mkdir db_merge_genus
#for i in `ls db_merge`; do echo $i; qsub -j y -pe def_slot 1 ~/qsubsh4 bash ./run-db_merge_genus $i; done
for i in `ls db_merge`; do echo "echo $i; bash ./run-db_merge_genus $i"; done|xargs -I{} -P6 bash -c "{}"
mkdir db_merge_species
#for i in `ls db_merge`; do echo $i; qsub -j y -pe def_slot 1 ~/qsubsh4 bash ./run-db_merge_species $i; done
for i in `ls db_merge`; do echo "echo $i; bash ./run-db_merge_species $i"; done|xargs -I{} -P6 bash -c "{}"
#cat run-db_merge_genus
i=$1
awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}' ~/work2/metagenome-db/chnage-blast-filter/SILVA_132_SSU-LSU_Ref.fasta.name.genus ~/work2/metagenome-db/chnage-blast-filter/db_merge/$i > ~/work2/metagenome-db/chnage-blast-filter/db_merge_genus/$i
#cat run-db_merge_species
i=$1
awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}' ~/work2/metagenome-db/chnage-blast-filter/SILVA_132_SSU-LSU_Ref.fasta.name.species ~/work2/metagenome-db/chnage-blast-filter/db_merge/$i > ~/work2/metagenome-db/chnage-blast-filter/db_merge_species/$i
Transfer to the production servers, split, and deploy
#in ~/work2/metagenome-db/chnage-blast-filter @m32s
rsync -av --progress db_merge_genus/ m50v251n3:/usr/local/yoshitake/db_genus_merge/
rsync -av --progress db_merge_species/ m50v251n3:/usr/local/yoshitake/db_species_merge/
rsync -av --progress db_merge/ m50v251n3:/usr/local/yoshitake/db_merge/
ssh m50v251n3
cd /usr/local/yoshitake
#all
cd /usr/local/yoshitake/db
for i in `ls ../db_merge/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_merge/$i ; done
#genus
cd /usr/local/yoshitake/db_genus
for i in `ls ../db_genus_merge/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_genus_merge/$i ; done
#species
cd /usr/local/yoshitake/db_species
for i in `ls ../db_species_merge/`; do echo $i; j=`basename $i .tsv`; mkdir -p $j; awk -F'\t' -v dir=$j '$1=="id"{filename=dir"/"$2".fasta.ssu.blast.filtered.name.lca.cnt2.input"} {print $0 > filename}' ../db_species_merge/$i ; done
for i in 1 2 4 5 6 7 8 9; do rsync -av --progress /usr/local/yoshitake m50v251n$i:/usr/local/; done
Creating the HTML files from the command line
#@yoshitake/tt
cp -r ~/metasearch/tmp .
rm tmp/*/*.gz tmp/*/*.fq
#for i in tmp/*/*.tsv; do bash run-rerun.sh $i; done
for i in tmp/*/*.tsv; do echo bash run-rerun.sh $i; done|xargs -I {} -P 15 bash -c "{}"
for i in tmp/*/*/result.html; do echo "<a href='$i'>$i</a>"; done > list.html
#i=tmp/641b199531d9634a9710474c83f1cd63/641b199531d9634a9710474c83f1cd63.fq.tsv
i=$1
name="`cat $(dirname $i)/$(dirname $i|sed 's/tmp.//')/result.html |grep h3|head -n 1|sed 's/<\/.*//; s/.*>//'`"
#species
mkdir -p $(dirname $i)/$(dirname $i|sed 's/tmp.//')
awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}' ~/yoshitake/SILVA_132_SSU-LSU_Ref.fasta.name.species $i > $i.species.tsv
./calccor $i.species.tsv ~/yoshitake/db_merge_species
singularity exec ~/metasearch/script/python3_env_mako_installed.sif python create_page_species.py "`echo $i.species.tsv|sed 's/.tsv$//'`" "$name"
singularity run ~/metasearch/script/krona_v2.7.1_cv1.sif ktImportText $(dirname $i)/result.kraken -o $(dirname $i)/$(dirname $i|sed 's/tmp.//')/krona_out.html
mv $(dirname $i)/$(dirname $i|sed 's/tmp.//') $(dirname $i)/$(dirname $i|sed 's/tmp.//')s
#genus
mkdir -p $(dirname $i)/$(dirname $i|sed 's/tmp.//')
awk -F'\t' 'FILENAME==ARGV[1]{a["root;"$2]=$3} FILENAME==ARGV[2]{if($1=="id"){if(FNR>1){for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}; print $0; delete cnt}else{cnt[a[$1]]+=$2}} END{for(i in cnt){if(i!=""){print i"\t"cnt[i]}}}' ~/yoshitake/SILVA_132_SSU-LSU_Ref.fasta.name.genus $i > $i.genus.tsv
./calccor $i.genus.tsv ~/yoshitake/db_merge_genus
singularity exec ~/metasearch/script/python3_env_mako_installed.sif python create_page_genus.py "`echo $i.genus.tsv|sed 's/.tsv$//'`" "$name"
singularity run ~/metasearch/script/krona_v2.7.1_cv1.sif ktImportText $(dirname $i)/result.kraken -o $(dirname $i)/$(dirname $i|sed 's/tmp.//')/krona_out.html
mv $(dirname $i)/$(dirname $i|sed 's/tmp.//') $(dirname $i)/$(dirname $i|sed 's/tmp.//')g
#redo total
mkdir -p $(dirname $i)/$(dirname $i|sed 's/tmp.//')
./calccor $i ~/yoshitake/db_merge
singularity exec ~/metasearch/script/python3_env_mako_installed.sif python create_page.py "`echo $i|sed 's/.tsv$//'`" "$name"
singularity run ~/metasearch/script/krona_v2.7.1_cv1.sif ktImportText $(dirname $i)/result.kraken -o $(dirname $i)/$(dirname $i|sed 's/tmp.//')/krona_out.html