SRA data download / metadata download

SRA metadata download 2024/03/13

const fs = require('fs');
const zlib = require('zlib');
const tar = require('tar');
const xml2js = require('xml2js');

// Path to the tar.gz file
// Take the first command-line argument
const firstArg = process.argv[2];
const tarGzFilePath = firstArg; // e.g. 'NCBI_SRA_Metadata_Full_20240219.tar.gz'

// Create a gunzip stream
const gunzipStream = fs.createReadStream(tarGzFilePath).pipe(zlib.createGunzip());

// Create a tar parse stream
const extractStream = gunzipStream.pipe(new tar.Parse());

// Buffers for the output lines
let res_srr = []
let res_srx = []
let res_srp = []
let res_srs = []

function removeFile(filePath){
  if (fs.existsSync(filePath)) {
    fs.unlinkSync(filePath);
    console.log(filePath + ' was deleted.');
  } else {
    console.log(filePath + ' does not exist.');
  }
}

removeFile("output_srr.txt")
removeFile("output_srx.txt")
removeFile("output_srp.txt")
removeFile("output_srs.txt")

// Fired once per file entry as the archive is unpacked
extractStream.on('entry', (entry) => {
  // Process only XML files
  if (entry.path.endsWith('.xml')) {
    let xmlData = '';

    // Accumulate the XML data
    entry.on('data', (chunk) => {
      xmlData += chunk.toString();
    });

    // Once the XML file has been fully read
    entry.on('end', () => {
      // Convert the XML to a JS object
      xml2js.parseString(xmlData, (err, result) => {
        if (err) {
          console.error('Error parsing XML:', err);
        } else {
         try{
          //console.log('JSON data:', JSON.stringify(result, null, 2));
          if("RUN_SET" in result){
            for(let srr of result["RUN_SET"]["RUN"]){
              //console.log("srr: ", srr)
              let idsrr = ""
              let idsrx = ""
              if("IDENTIFIERS" in srr && "PRIMARY_ID" in srr["IDENTIFIERS"][0]){
                idsrr = srr["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }
              if("EXPERIMENT_REF" in srr && "IDENTIFIERS" in srr["EXPERIMENT_REF"][0] && "PRIMARY_ID" in srr["EXPERIMENT_REF"][0]["IDENTIFIERS"][0]){
                idsrx = srr["EXPERIMENT_REF"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }else if("EXPERIMENT_REF" in srr && "$" in srr["EXPERIMENT_REF"][0] && "accession" in srr["EXPERIMENT_REF"][0]["$"]){
                idsrx = srr["EXPERIMENT_REF"][0]["$"]["accession"]
              }
              let res = idsrr+"\t"+idsrx
              console.log(res)
              res_srr.push(res)
              if(res_srr.length>=10000){writefile("output_srr.txt",res_srr); res_srr=[]}
            }
          }else if("STUDY_SET" in result){
            for(let srp of result["STUDY_SET"]["STUDY"]){
              let idsrp = ""
              let srptitle = ""
              let srpabstract = ""
              let srpdesc = ""
              let srptype = ""
              if("IDENTIFIERS" in srp && "PRIMARY_ID" in srp["IDENTIFIERS"][0]){
                idsrp = srp["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }
              if("DESCRIPTOR" in srp && "STUDY_TITLE" in srp["DESCRIPTOR"][0]){
                srptitle = srp["DESCRIPTOR"][0]["STUDY_TITLE"][0].replace(/\t/g, " ");
              }
              if("DESCRIPTOR" in srp && "STUDY_DESCRIPTION" in srp["DESCRIPTOR"][0]){
                srpdesc = srp["DESCRIPTOR"][0]["STUDY_DESCRIPTION"][0].replace(/\t/g, " ");
              }
              if("DESCRIPTOR" in srp && "STUDY_TYPE" in srp["DESCRIPTOR"][0] && "$" in srp["DESCRIPTOR"][0]["STUDY_TYPE"][0] && "existing_study_type" in srp["DESCRIPTOR"][0]["STUDY_TYPE"][0]["$"]){
                srptype = srp["DESCRIPTOR"][0]["STUDY_TYPE"][0]["$"]["existing_study_type"]
              }
              if("DESCRIPTOR" in srp && "STUDY_ABSTRACT" in srp["DESCRIPTOR"][0]){
                srpabstract = srp["DESCRIPTOR"][0]["STUDY_ABSTRACT"][0].replace(/\t/g, " ");
              }
              let res = idsrp+"\t"+srptitle+"\t"+srpabstract+"\t"+srpdesc+"\t"+srptype
              console.log(res)
              res_srp.push(res)
              if(res_srp.length>=10000){writefile("output_srp.txt",res_srp); res_srp=[]}
            }
          }else if("SAMPLE_SET" in result){
            //console.log(JSON.stringify(result, null, 2))
            for(let srs of result["SAMPLE_SET"]["SAMPLE"]){
              let idsrs = ""
              let srsname = ""
              let srsdesc = ""
              let srstitle = ""
              let srsdate = ""
              let srslatlon = ""
              if("IDENTIFIERS" in srs && "PRIMARY_ID" in srs["IDENTIFIERS"][0]){
                idsrs = srs["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }
              if("SAMPLE_NAME" in srs && "SCIENTIFIC_NAME" in srs["SAMPLE_NAME"][0]){
                srsname = srs["SAMPLE_NAME"][0]["SCIENTIFIC_NAME"][0]
              }
              if("DESCRIPTION" in srs){
                srsdesc = srs["DESCRIPTION"][0].replace(/\t/g, " ");
              }
              if("TITLE" in srs){
                srstitle = srs["TITLE"][0].replace(/\t/g, " ");
              }
              if("SAMPLE_ATTRIBUTES" in srs && "SAMPLE_ATTRIBUTE" in srs["SAMPLE_ATTRIBUTES"][0]){
                srs["SAMPLE_ATTRIBUTES"][0]["SAMPLE_ATTRIBUTE"].forEach(att => {
                  if(att["TAG"][0]==="collection_date"){
                    srsdate = att["VALUE"][0]
                  }else if(att["TAG"][0]==="lat_lon"){
                    srslatlon = att["VALUE"][0]
                  }
                })
              }
              let res = idsrs+"\t"+srsname+"\t"+srsdesc+"\t"+srstitle+"\t"+srsdate+"\t"+srslatlon
              console.log(res)
              res_srs.push(res)
              if(res_srs.length>=10000){writefile("output_srs.txt",res_srs); res_srs=[]}
            }
          }else if("EXPERIMENT_SET" in result){
            //console.log(JSON.stringify(result, null, 2))
            for(let srx of result["EXPERIMENT_SET"]["EXPERIMENT"]){
              let idsrx = ""
              let idsrp = ""
              let idsrs = ""
              let srxtitle = ""
              let srxplat = ""
              let srxplat2 = ""
              if("IDENTIFIERS" in srx && "PRIMARY_ID" in srx["IDENTIFIERS"][0]){
                idsrx = srx["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }
              if("TITLE" in srx){
                srxtitle = srx["TITLE"][0]
              }
              if("STUDY_REF" in srx && "IDENTIFIERS" in srx["STUDY_REF"][0] && "PRIMARY_ID" in srx["STUDY_REF"][0]["IDENTIFIERS"][0]){
                idsrp = srx["STUDY_REF"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
              }else if("STUDY_REF" in srx && "$" in srx["STUDY_REF"][0] && "accession" in srx["STUDY_REF"][0]["$"]){
                idsrp = srx["STUDY_REF"][0]["$"]["accession"]
              }
              if("PLATFORM" in srx){
                srxplat = Object.keys(srx["PLATFORM"][0])[0]
                if("INSTRUMENT_MODEL" in srx["PLATFORM"][0][srxplat][0]){
                  srxplat2 = srx["PLATFORM"][0][srxplat][0]["INSTRUMENT_MODEL"][0]
                }
              }
              if("DESIGN" in srx){
                for(let srs of srx["DESIGN"]){
                  if("SAMPLE_DESCRIPTOR" in srs && "IDENTIFIERS" in srs["SAMPLE_DESCRIPTOR"][0] && "PRIMARY_ID" in srs["SAMPLE_DESCRIPTOR"][0]["IDENTIFIERS"][0]){
                    idsrs = srs["SAMPLE_DESCRIPTOR"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                  }else if("SAMPLE_DESCRIPTOR" in srs && "$" in srs["SAMPLE_DESCRIPTOR"][0] && "accession" in srs["SAMPLE_DESCRIPTOR"][0]["$"]){
                    idsrs = srs["SAMPLE_DESCRIPTOR"][0]["$"]["accession"]
                  }
                  //Basically each SRX seems to have only one SRS, so this could probably be printed outside the loop
                  let res = idsrx+"\t"+idsrp+"\t"+idsrs+"\t"+srxtitle+"\t"+srxplat+"\t"+srxplat2
                  console.log(res)
                  res_srx.push(res)
                  if(res_srx.length>=10000){writefile("output_srx.txt",res_srx); res_srx=[]}
                }
              }
            }
          }
         }catch(e){
          console.error(e)
          console.error(JSON.stringify(result, null, 2))
         }
        }
      });
    });
  } else {
    // Ignore non-XML files
    entry.resume();
  }
});

// Error handling
extractStream.on('error', (err) => {
  console.error('Error extracting file:', err);
});

// On completion
extractStream.on('end', () => {
  console.log('Finished');
  writefile("output_srr.txt", res_srr)
  writefile("output_srx.txt", res_srx)
  writefile("output_srp.txt", res_srp)
  writefile("output_srs.txt", res_srs)
});

function writefile(filePath, content){
  try {
    fs.appendFileSync(filePath, content.join("\n")+"\n");
    console.log(filePath+' was written successfully.');
  } catch (err) {
    console.error('Error while writing '+filePath+':', err);
  }
}
cd ~/work3/metasearch/2023/sra-20240219/
file=`curl https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/|grep NCBI_SRA_Metadata_Full_|tail -n 1|sed 's%</a>.*%%; s/.*>//'`
# NCBI_SRA_Metadata_Full_20240219.tar.gz
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/$file
npm install tar xml2js # zlib is a Node.js built-in module
node --max-old-space-size=204800 run3.js $file #200GB heap. Even so, it crashed with "FATAL ERROR: Scavenger: semi-space copy Allocation failed - JavaScript heap out of memory" before reaching 30GB of memory use, so the script was modified to write its output in chunks.
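For reference, the columns of the four output files written by the script above (taken from its print statements) are:
#output_srr.txt: SRR ID, SRX ID
#output_srx.txt: SRX ID, SRP ID, SRS ID, title, platform, instrument model
#output_srp.txt: SRP ID, title, abstract, description, study type
#output_srs.txt: SRS ID, scientific name, description, title, collection_date, lat_lon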

#Uses about 50GB of memory
awk -F'\t' 'FILENAME==ARGV[1]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srs[$1]=str} FILENAME==ARGV[2]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srp[$1]=str} FILENAME==ARGV[3]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srx[$1]=str; srx2srp[$1]=$2; srx2srs[$1]=$3} FILENAME==ARGV[4]{srxid=$2; print $0"\t"srx[srxid]"\t"srs[srx2srs[srxid]]"\t"srp[srx2srp[srxid]]}' output_srs.txt output_srp.txt output_srx.txt output_srr.txt > output.txt

ssh ddbj
qlogin
cp /usr/local/resources/dra/meta/list/sralist sralist.2024-03-13
exit
exit
scp ddbj:sralist.2024-03-13 .

cat output.txt| awk -F'\t' '{split($4,arr,"###"); split(arr[5],arr2," "); if(length(arr2)==4){split($3,arr,"###"); print $0}}'|awk -F'\t' 'FILENAME==ARGV[1]{dra[$7]=$0} FILENAME==ARGV[2]&& $1 in dra{print dra[$1]"\t"$0}' sralist.2024-03-13 /dev/stdin > sralist.2024-03-13.inddbj-with-latlon

cat output.txt |grep -i metagenom|awk -F'\t' 'FILENAME==ARGV[1]{dra[$7]=$0} FILENAME==ARGV[2]&& $1 in dra{print dra[$1]"\t"$0}' sralist.2024-03-13 /dev/stdin > sralist.2024-03-13.inddbj-metagenom

for i in db3 db3-2 db3-3 db4 db4-2; do ls ../$i|grep "^[1-9][0-9]*$"|while read j; do ls ../$i/$j|grep "[.]fasta[.]gz$"|sed 's/[.]fasta[.]gz$//; s/[.]lite$//'; done; done > downloaded_fasta_list.db3-db4.txt
more downloaded_fasta_list.db3-db4.txt|sort|uniq > downloaded_fasta_list.db3-db4.sort.txt

awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($7 in a){print $0}' downloaded_fasta_list.db3-db4.sort.txt sralist.2024-03-13.inddbj-with-latlon > sralist.2024-03-13.inddbj-with-latlon.new
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($7 in a){print $0}' downloaded_fasta_list.db3-db4.sort.txt sralist.2024-03-13.inddbj-metagenom > sralist.2024-03-13.inddbj-metagenom.new

#For display in mitosearch
cat output.txt| awk -F'\t' '{split($4,arr,"###"); split(arr[5],arr2," "); if(length(arr2)==4){print $0}}'|awk -F'\t' 'FILENAME==ARGV[1]{drr[$1]=1} FILENAME==ARGV[2]&& $1 in drr{print $0}' downloaded_fasta_list.db3-db4.sort.txt /dev/stdin > downloaded_fasta_list.db3-db4.sort.txt.with-latlon
mkdir ../test2
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME!=ARGV[1]{if(FNR==1){print FILENAME}; if($0~"^id"){if($2 in a){id=$2}else{id=""}}; if(id!=""){print $0 > "../test2/"FILENAME}}' downloaded_fasta_list.db3-db4.sort.txt.with-latlon *.tsv
cd ../test2
for i in *.tsv; do mkdir -p ../db_fish_ja/$i; done
for i in *.tsv; do awk -F'\t' '{if(FNR==1){print FILENAME}; if($0~"^id"){id=$2}; if(id!=""){print $0 > "../db_fish_ja/"FILENAME"/"id".input"}}' $i; done 
for i in *.tsv; do awk -F'\t' '{if(FNR==1){print FILENAME > "/dev/stderr"}; if($0~"^id"){id=$2; print id"\t"FILENAME"/"id".input"}}' $i; done > ../mitosearch/Mitosearch/data/fish/input_file_path.txt  
more ~/work3/metasearch/2023/sra-20240219/downloaded_fasta_list.db3-db4.sort.txt.with-latlon|awk -F'\t' '{split($4,arr,"###"); print $1"\t"arr[5]"\t"arr[4]}' > lat-long-date.txt
more lat-long-date.txt|awk -F'\t' '{print $1"\t1"}' > mapwater.result.txt
#node --max-old-space-size=204800 01blockSeparatorForLevel1to17meta.js ja
for i in `seq 2 18`; do node --max-old-space-size=204800 01blockSeparatorForLevel2to18meta.js ja $i & done; wait


#For metasearch
cat sralist.2024-03-13.inddbj-metagenom.new |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' '{print $7"\t"$1}'  > sralist.new.txt
pigz sralist.new.txt
scp sralist.new.txt.gz ddbj:
ssh ddbj
qlogin
mkdir db5
mv sralist.new.txt.gz db5/
cd db5

#Extract in batches of 5000 samples
ln -s ~/db4/run-get-reads.sh .
zcat sralist.new.txt.gz | awk -F'\t' -v block=5000 '{n=int((NR-1)/block)+1; print n"\t"$1"\t"$2 > "split."n}'
#split.1 contains lines like "1       ERR8268026      /usr/local/resources/dra/sralite/ByExp/litesra/ERX/ERX783/ERX7830965/ERR8268026/ERR8268026.sra"
mkdir -p log
for i in split.*; do qsub -j y -o log/getreads.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Finding failed jobs

##Find failed jobs
#for i in log/getreads.*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error.log

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error.job
mkdir rerun
awk -F'\t' -v block=100 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Finding failed jobs, round 2

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error2.job
mkdir rerun2
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Finding failed jobs, round 3

##Find failed jobs
#for i in log/getreads3.rerun2/split.*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error3.log

#Find runs with no output file (same result as error3.log above)
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error3.job
mkdir rerun3
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun3/split."n}' error3.job
for i in rerun3/split.*; do qsub -j y -o log/getreads4.$i -N getreads ~/qsub8GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Finding failed jobs, round 4

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error4.job
#No failures this time. If there had been failures:
#mkdir rerun4
#awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun4/split."n}' error4.job
for i in rerun4/split.*; do qsub -j y -o log/getreads5.$i -N getreads ~/qsub8GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Compress for transfer

for i in $(seq 1 `ls split.*|wc -l`); do qsub -j y -o log/tarreads.$i -N tarreads ~/qsub2GB.sh tar cvf $i.tar $i; done

#tail -n 1 log/tarreads.*|grep ": 0"|wc -l #check that the count matches the number of split.* files

On m32s

mkdir ~/work3/metasearch/2023/db5
cd ~/work3/metasearch/2023/db5
cp ../db4-2/run-blast3-4.sh .
rsync -av --progress ddbj:db5/*.tar .

for i in [1-9]*.tar; do
 tar vxf $i
 j=`basename $i .tar`
 cd $j;
 ls *.fasta.gz|while read i2; do
  srr=`basename $i2 .fasta.gz`;
  qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 done;
 cd ..;
done

Error check

for k in *.tar; do k=`basename $k .tar`; echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done > check.txt

If there are errors

grep gz check.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 cd ..
done

Error check, round 2

grep gz check.txt|while read i; do
 k=`dirname $i`
 j=`basename $i .fasta.gz`
 if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $i; fi
done > check2.txt

grep gz check2.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 cd ..
done

Error check, round 3

grep gz check2.txt|while read i; do
 k=`dirname $i`
 j=`basename $i .fasta.gz`
 if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $i; fi
done > check3.txt

#Some strange non-nucleotide byte characters were present, so remove them
zcat 321/ERR1883431.fasta.gz|awk 'NR%2==1{print $0} NR%2==0{gsub(/[^a-zA-Z]/,"",$0); print $0}'|gzip -c > ERR1883431.fasta.gz
mv ERR1883431.fasta.gz 321

grep gz check3.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 cd ..
done

Error check, round 4

grep gz check3.txt|while read i; do
 k=`dirname $i`
 j=`basename $i .fasta.gz`
 if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $i; fi
done > check4.txt

Check samples with few hit reads

find [1-9]*|grep txt$|xargs -I{} -P 6 bash -c "bash run-check-hit.sh {}" |tee check-reads.txt

Fetch a larger FASTQ (on the DDBJ supercomputer)

mkdir db5-2
cd db5-2
ln -s ../db5/sralist.new.txt.gz .
zcat sralist.new.txt.gz |awk -F'\t' 'FILENAME==ARGV[1]{split($1,arr,"/"); split(arr[2],arr2,"."); reads[arr2[1]]=$2; n[arr2[1]]=arr[1]} FILENAME==ARGV[2] && $1 in n{print n[$1]"\t"$1"\t"$2"\t"reads[$1]}' check-reads.txt /dev/stdin > check-reads.txt2
#check-reads.txt contains lines like "106/ERR6164986.txt      100000"
split -l 200 check-reads.txt2 split-check-reads_ 
mkdir -p log
ln -s ../db5/run-get-reads.sh .
for i in split-check-reads_*; do qsub -j y -o log/getreads.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done

Finding failed jobs

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && !($2 in a){print $0}' /dev/stdin <(cat split-check-reads_*) > error.job
mkdir rerun
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub8GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done

Once more

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && !($2 in a){print $0}' /dev/stdin <(cat rerun/split.*) > error2.job
#mkdir rerun2
#awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
#for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub8GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done

Compress for transfer

for i in $(seq 1 $(tail -n 1 `ls split-check-reads_* -d|tail -n 1`|cut -f 1)); do qsub -j y -o log/tarreads.$i -N tarreads ~/qsub2GB.sh tar cvf $i.tar $i; done

#tail -n 1 log/tarreads.*|grep ": 0"|wc -l #check that the count matches the number of split.* files

Analyze the larger files on m32s

cd ~/work3/metasearch/2023/db5-2
cp ../db4-2/run-blast3-4.sh .
for i in [1-9]*.tar; do
 tar vxf $i
 j=`basename $i .tar`
 cd $j;
 ls *.fasta.gz|while read i2; do
  srr=`basename $i2 .fasta.gz`;
  qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 done;
 cd ..;
done

#On iryo
for i in 2*.tar; do
 tar vxf $i
 j=`basename $i .tar`
 cd $j;
 ls *.fasta.gz|while read i2; do
  srr=`basename $i2 .fasta.gz`;
  qsub -p -1000 -j y -l mem_req=0G -pe def_slot 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 done;
 cd ..;
done

SRR download 2024/01/18

Download the list of metagenome SRX entries.

https://www.ncbi.nlm.nih.gov/sra/?term=metagenome

Open the page and use Send to → File → Summary to download the results. RunInfo format would really be preferable, but only 100,000 records can be downloaded that way.

Transfer the downloaded sra_result_2024-01-19.csv to the DDBJ supercomputer and extract the SRR file locations.
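The transfer itself can be done with scp, for example (a sketch; assumes the same ddbj ssh host alias used elsewhere in these notes):

scp sra_result_2024-01-19.csv ddbj: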

more sra_result_2024-01-19.csv |cut -f 1 -d,|tail -n+2|sed 's/^"//; s/"$//'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1; n++} FILENAME==ARGV[2] && $8 in a{print $0; m++} END{print n,m,m/n*100 > "sralist.info"}' /dev/stdin /usr/local/resources/dra/meta/list/sralist > sralist.txt
#Of 4,127,550 records, 3,168,212 had files: 76.7577%

However, sra_result.csv downloaded via the web browser sometimes seems to be cut off partway; downloading it several times should eventually give the complete file. Alternatively, to get the SRR list from the command line:

docker run -t --rm ncbi/blast:2.13.0 bash -c "esearch -db sra -query metagenome|efetch -format runinfo"|cut -f 1 -d,|tail -n+2 > ~/work3/metasearch/2023/db4/sra_runinfo.txt

cat sra_runinfo.txt |cut -f 1 -d,|tail -n+2|sed 's/^"//; s/"$//'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1; n++} FILENAME==ARGV[2] && $7 in a{print $0; m++} END{print n,m,m/n*100 > "sralist.info"}' /dev/stdin /usr/local/resources/dra/meta/list/sralist > sralist.txt

#Data used by metasearch to look up sample information
#/usr/local/resources/dra/meta/list/sralist -> sralist.2024-02-20
more sra_result_2024_01-19.csv|awk -F'","' '{print $1"\t"$2"\t"$3"\t"$7}'|sed 's/^"//' > sra_result_2024_01-19.csv.tsv
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&$8 in a{b[$8]=$7} FILENAME==ARGV[3]{OFS="\t"; $1=$1"\t"b[$1]; print $0}' sra_result_2024_01-19.csv.tsv sralist.2024-02-20 ./sra_result_2024_01-19.csv.tsv > ./sra_result_2024_01-19.csv.tsv2
more ./sra_result_2024_01-19.csv.tsv2|awk -F'\t' '$2!=""' > sra_result_2024_01-19.csv.tsv3

Extract the data that needs to be added

cat sralist.txt |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($1 in a){print $0}' ~/meta/sra_result.csv.run.ddbj2 /dev/stdin > sralist.new.txt
cat sralist.new.txt|cut -f 1|xargs ls > sralist.new.exist.txt
cat ~/meta/sra_result.csv.run.ddbj2 sralist.new.exist.txt > sralist.alltrial.txt
cat sralist.txt |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && $1 in a{print $0}' sralist.new.exist.txt /dev/stdin > sralist.new.exist.full.txt

Extract 300 reads per run

awk -F'\t' -v block=5000 '{n=int((NR-1)/block)+1; print n"\t"$7"\t"$1 > "split."n}' sralist.new.exist.full.txt
mkdir -p log
for i in split.*; do qsub -j y -o log/getreads.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done
#run-get-reads.sh
i=$2 #SRR ID
dir=$1 #1 output dir
path=$3 #/.sra
n=$4 #300 output reads

reads_down=300000 #How many reads (after removing Ns, keeping only reads of at least the minimum length) to pass to random subsampling. For paired-end data one read pair is counted as one read.
reads_max_down=`expr $reads_down '*' 40` #Number of reads to download before filtering.
min_len=30

j=$i
k=$path #`grep $j$'\t' sralist.dir.txt|cut -f 3`
if [ "$k" != "" ]; then
 echo $k
 mkdir -p $dir
 if [ `file $k|grep gzip|wc -l` = 1 ]; then
  echo bam #Many DDBJ ERR files have the .sra extension but appear to actually be unmapped BAM. Also, apparently when the data is not paired-end(?), each read appears twice, forward and reverse. With this script, duplicate read names stay duplicated in the final output.
  samtools fastq $k|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
 else
  echo sra
  fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $k|paste - - - -|head -n $reads_max_down > /tmp/$j
 fi
 cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
  $2~"[0-3]"{exit} #for solid sequencer
  BEGIN{srand(seed)}
  { #Read names such as @SRR052204.4/4 may carry suffixes /1 through /4
   str=$2; gsub(/[^ACGTacgt]/,"",str);
   if(length(str)>=min_len){
    split($1,arr," "); split(arr[1],arr2,"/");
    if(arr2[1]!=old){
     old=arr2[1]
     n++
     if(n>1){
      print rand()"\t"old"\t"r1"\t"r2
     }
     r1=""
     r2=""
     if(n>reads_down){exit}
    }
    if(length($2)>length(r1)){r2=r1; r1=$2}
    else if(length($2)>length(r2)){r2=$2}
   }
  }
  END{print rand()"\t"old"\t"r1"\t"r2}
 ' > /tmp/$j.2
 sort --parallel=1 --buffer-size=1G /tmp/$j.2|head -n $n|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $dir/$j.fasta.gz;
 rm /tmp/$j /tmp/$j.2
else
 echo No sra
fi

Finding failed jobs

#Find failed jobs
for i in getreads.o*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error.log

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error.job
mkdir rerun
awk -F'\t' -v block=100 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Once more

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error2.job
mkdir rerun2
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done

Compress for transfer

mkdir -p log
for i in $(seq 1 `ls split.*|wc -l`); do qsub -j y -o log/tarreads.$i -N tarreads ~/qsub2GB.sh tar cvf $i.tar $i; done

On m32s

cd ~/work3/metasearch/2023/db4
rsync -av --progress ddbj:db4/*.tar .

for i in [1-9]*.tar; do
 tar vxf $i
 j=`basename $i .tar`
 cd $j;
 ls *.fasta.gz|while read i2; do
  srr=`basename $i2 .fasta.gz`;
  qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 done;
 cd ..;
done
#run-blast3-4.sh
srr=$1
dir=$HOME/work3/metasearch/2023/db

db=/suikou/files/m768-data2/2023jissyu/nanopore/db/mergedDB.maskadaptors.fa
bitscore=100
top=0.99

set -eux
set -o pipefail

zcat $srr.fasta.gz | blastn -db $db -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' '{split($1,arr,"/");
  if(arr[1]!=old){for(hit in data){temp[hit]=data[hit]["1"]+data[hit]["2"]}; PROCINFO["sorted_in"]="@val_num_desc"; for(hit in temp){print old"\t"hit"\t"temp[hit]}; old=arr[1]; delete data; delete temp};
  if(data[$2][arr[2]]==0){data[$2][arr[2]]=$12}}' |
 awk -F'\t' '$3>'$bitscore'{if(a[$1]==1){if($3>=topbit*'$top'){print $0}}else{a[$1]=1; topbit=$3; print $0}}' |
 awk -F'\t' 'FILENAME==ARGV[1]{name[$1]=$2} FILENAME==ARGV[2]{print name[$2]"\t"$0}' $db.path /dev/stdin |
 awk -F'\t' '
function searchLCA(data,  i, j, res, res2, str, n, stopflag){
 for(i in data){
  if(n==0){n=split(i,res,";")}
  else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
 }
 if(res[1]!=""){str=res[1]}
 else{
  #i: taxonomy path
  #Chloroplast and plant 18S sequences seem to be highly similar, so when both are mixed in, give priority to chloroplast
  chloroplast=0
  delete datachloro
  for(i in data){
   if(i~"^Bacteria;Cyanobacteria;Cyanobacteriia;Chloroplast;"){chloroplast++; datachloro[i]=1}
  }
  if(chloroplast>0){
   n2=0
   for(i in datachloro){
    if(n2==0){n2=split(i,res,";")}
    else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
   }
  }
 }
 if(res[1]!=""){str=res[1]}
 else{
  str="unknown"; stopflag=1
 };
 for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
 return str;
}
{
 if($2!=old){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; data[$1]=1; old=$2; oldstr=$0}
 else{data[$1]=1}
}
END{if(length(data)>0){print searchLCA(data)"\t"oldstr}}
' > $srr.txt

awk -F'\t' '{cnt[$1]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result

Error check

for k in *.tar; do k=`basename $k .tar`; echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done > check.txt

If there are errors

grep gz check.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr";
 cd ..
done

Check samples with few hit reads

find [1-9]*|grep txt$|xargs -I{} -P 6 bash -c "bash run-check-hit.sh {}" > check-reads.txt
#run-check-hit.sh
file=$1 #1000/SRR16966126.txt

n1=300
n_min=200
n_max=100000
n_req=300

id=`basename $file .txt`
dir=`dirname $file`
l=`cat $file|wc -l`

if [ $l -lt $n_min ]; then
 if [ `zcat $dir/$id.fasta.gz |grep "^>"|sed 's%/.*%%' | wc -l` = $n1 ]; then
  if [ $l = 0 ];then
   k=$n_max
  else
   k=`awk 'BEGIN {print int('$n_req'/'$l'*'$n1')}'`
  fi
  echo $file$'\t'$k
 fi
fi

On the DDBJ supercomputer

more /usr/local/resources/dra/meta/list/sralist |awk -F'\t' '{print $7"\t"$1}'|sed 's%ddbj_database%usr/local/resources%' > sralist.txt
sort sralist.txt > sralist.sorted.txt
split -l 200 check-reads.txt split-check-reads_
for i in split-check-reads_*; do qsub -j y -N getreads ./qsub.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done
#run-get-reads.sh                                                                                              
i=$1
n=$2

reads_down=300000 #How many reads (after removing Ns, keeping only reads of at least the minimum length) to pass to random subsampling. For paired-end data one read pair is counted as one read.
reads_max_down=`expr $reads_down '*' 40` #Number of reads to download before filtering.
min_len=30

j=`echo "$i"|cut -f 2 -d/|cut -f 1 -d.`
dir=`echo "$i"|cut -f 1 -d/`
#k=`look $j sralist.sorted.txt|cut -f 2` #look seems to run out of memory with 2GB
k=`grep $j$'\t' sralist.sorted.txt|cut -f 2`
if [ "$k" != "" ]; then
 echo $k
 mkdir -p $dir
 if [ `file $k|grep gzip|wc -l` = 1 ]; then
  echo bam #Many DDBJ ERR files have the .sra extension but appear to actually be unmapped BAM. Also, apparently when the data is not paired-end(?), each read appears twice, forward and reverse. With this script, duplicate read names stay duplicated in the final output.
  samtools fastq $k|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
 else
  echo sra
  fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $k|paste - - - -|head -n $reads_max_down > /tmp/$j
 fi
 cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
  $2~"[0-3]"{exit} #for solid sequencer
  BEGIN{srand(seed)}
  { #Read names such as @SRR052204.4/4 may carry suffixes /1 through /4
   str=$2; gsub(/[^ACGTacgt]/,"",str);
   if(length(str)>=min_len){
    split($1,arr," "); split(arr[1],arr2,"/");
    if(arr2[1]!=old){
     old=arr2[1]
     n++
     if(n>1){
      print rand()"\t"old"\t"r1"\t"r2
     }
     r1=""
     r2=""
     if(n>reads_down){exit}
    }
    if(length($2)>length(r1)){r2=r1; r1=$2}
    else if(length($2)>length(r2)){r2=$2}
    reads[arr2[1]][arr2[2]]=$2;
   }
  }
  END{print rand()"\t"old"\t"r1"\t"r2}
 ' > /tmp/$j.2
 sort --parallel=1 --buffer-size=1G /tmp/$j.2|head -n $n|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $dir/$j.fasta.gz;
 rm /tmp/$j /tmp/$j.2
else
 echo No sra
fi

Finding failed jobs

#Find failed jobs
for i in getreads.o*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error.log

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]{split($1,arr,"/"); split(arr[2],arr2,"."); if(!(arr2[1] in a)){print $0}}' /dev/stdin <(cat split-check-reads_*) > error.job
mkdir rerun
awk -F'\t' -v block=100 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done

Once more

#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]{split($1,arr,"/"); split(arr[2],arr2,"."); if(!(arr2[1] in a)){print $0}}' /dev/stdin <(cat rerun/split.*) > error2.job
mkdir rerun2
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done

On m32s

for j in [1-9]*; do cd $j;
 ls *.fasta.gz|while read i2; do
  srr=`basename $i2 .fasta.gz`;
  qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 done;
 cd ..;
done

Error check

for k in [1-9]*; do echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done | tee check.txt
grep gz check.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 cd ..
done

Get the ACCESSION->DEFINITION mapping

#cd work3/metasearch/2023/acc2desc
curl https://ftp.ncbi.nih.gov/genbank/|grep seq.gz|sed 's/<a[^>]*>//; s/<.*//' > seq.list
cat seq.list |while read i; do curl https://ftp.ncbi.nih.gov/genbank/$i|gzip -dc|awk '$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""} $0~"^ACCESSION "{acc=$2} f==1&&substr($0,1,1)!=" "{f=0} $0~"^DEFINITION "{f=1; des=""} f==1{des=des" "$0} END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/'| gzip -c > $i; done

At 10MB/s this would finish in about 10 days, but DDBJ has a mirror, so use that instead.

cd acc2desc/
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/; ls $dir|grep seq.gz$|while read i; do zcat $dir/$i|awk '$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""} $0~"^ACCESSION "{acc=$2} f==1&&substr($0,1,1)!=" "{f=0} $0~"^DEFINITION "{f=1; des=""} f==1{des=des" "$0} END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/' | gzip -c > $i; done
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/
ls $dir|grep seq.gz$|while read i; do qsub -N $i run-acc2desc.sh $i; done
#This exceeds the 5000-job qsub limit, so it is safer to split the submissions
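A possible way to split the submissions (a sketch, not what was actually run; the chunk size of 500 and the ~/qsub2GB.sh wrapper used elsewhere in these notes are assumptions):

mkdir -p log
ls $dir|grep seq.gz$ > seqgz.list
split -l 500 seqgz.list chunk_
for i in chunk_*; do qsub -j y -o log/acc2desc.$i -N acc2desc ~/qsub2GB.sh 'cat '$i' | while read f; do bash run-acc2desc.sh $f; done'; done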
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -j y

i=$1
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/
zcat $dir/$i|awk '
 $0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""}
 $0~"^ACCESSION "{acc=$2}
 f==1&&substr($0,1,1)!=" "{f=0}
 $0~"^DEFINITION "{f=1; des=""}
 f==1{des=des" "$0}
 END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/' | gzip -c > $i
#cd work3/metasearch/2023/db
find ../acc2desc/|grep gz$|xargs cat > acc2desc.gz

zcat acc2desc.gz | awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && $1 in a{print $0}' ~/work3/metasearch/2023/db/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid /dev/stdin > acc2desc.maskadaptors
sed 's/;/,/g' acc2desc.maskadaptors > acc2desc.maskadaptors2

For bacteria, use a DB built essentially from SILVA only 2023/11/29

Sum the bit scores of the two reads of each pair

ln -s ../db/??.tar .
for i in ??.tar; do
 j=`basename $i .tar`;
 if [ ! -e $j ]; then
  tar vxf $i; cd $j;
  ls *.fasta.gz|while read i2; do
   srr=`basename $i2 .fasta.gz`;
   qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr";
  done;
  cd ..;
 fi;
done
#run-blast3-2.sh
srr=$1
dir=$HOME/work3/metasearch/2023/db

db=/suikou/files/m768-data2/2023jissyu/nanopore/db/mergedDB.maskadaptors.fa
bitscore=100
top=0.99

zcat $srr.fasta.gz | blastn -db $db -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' '{split($1,arr,"/");
  if(arr[1]!=old){for(hit in data){temp[hit]=data[hit]["1"]+data[hit]["2"]}; PROCINFO["sorted_in"]="@val_num_desc"; for(hit in temp){print old"\t"hit"\t"temp[hit]}; old=arr[1]; delete data; delete temp};
  if(data[$2][arr[2]]==0){data[$2][arr[2]]=$12}}' |
 awk -F'\t' '$3>'$bitscore'{if(a[$1]==1){if($3>=topbit*'$top'){print $0}}else{a[$1]=1; topbit=$3; print $0}}' |
 awk -F'\t' 'FILENAME==ARGV[1]{name[$1]=$2} FILENAME==ARGV[2]{print name[$2]"\t"$0}' $db.path /dev/stdin |
 awk -F'\t' '
function searchLCA(data,  i, j, res, res2, str, n, stopflag){
 for(i in data){
  if(n==0){n=split(i,res,";")}
  else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
 }
 if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
 for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
 return str;
}
{
 if($2!=old){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; data[$1]=1; old=$2; oldstr=$0}
 else{data[$1]=1}
}
END{if(length(data)>0){print searchLCA(data)"\t"oldstr}}
' > $srr.txt

awk -F'\t' '{cnt[$1]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result

Error check

for k in *.tar; do k=`basename $k .tar`; echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done > check.txt
grep gz check.txt|while read i; do 
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr";
 cd ..
done

Output the lexicographically smallest hit among the BLAST top hits 2023/9/12

for i in *.tar; do
 j=`basename $i .tar`;
 if [ ! -e $j ]; then
  tar vxf $i; cd $j;
  ls *.fasta.gz|while read i2; do
   srr=`basename $i2 .fasta.gz`;
   qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast2.sh $srr";
  done;
  cd ..;
 fi;
done
#run-blast2.sh
srr=$1
dir=$HOME/work3/metasearch/2023/db

zcat $srr.fasta.gz | blastn -db $dir/create_blast_db_tempdir/database.fasta -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |awk -F'\t' -v min_bit=100 '{split($1,arr,"/"); if(data[arr[1]][$2][arr[2]]==0){data[arr[1]][$2][arr[2]]=$12}} END{for(read in data){bit=0; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>bit){bit=temp}}; if(bit>=min_bit){delete out; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp==bit){out[hit]=1}}; PROCINFO["sorted_in"]="@ind_str_asc"; for(hit in out){print read"\t"hit"\t"temp; break}}}}'|awk -F'\t' 'FILENAME==ARGV[1]{acc2tax[$1]=$3} FILENAME==ARGV[2]{tax2name[$1]=$2} FILENAME==ARGV[3]{acc2desc[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"."); print $0"\t"tax2name[acc2tax[arr[1]]]";acc:"arr[1]":"acc2desc[arr[1]]}' $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.names.dmp $dir/acc2desc.maskadaptors2 /dev/stdin|sed 's/\troot;cellular organisms;/\t/' > $srr.txt

awk -F'\t' '{cnt[$4]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result

SRA download + blast 2023/8/5

for n in `seq 10 20`; do
 mkdir $n; cd $n
 awk -F'\t' -v n=$n 'NR>(n-1)*10000&&NR<=n*10000' ~/oldhome/work2/metagenome-db/2022/sra_result.csv.run|cut -f 2|while read srr; do
  if [ ! -e $srr.result ]; then echo ../run-download.sh $srr; fi
 done|xargs -I{} -P5 bash -c "{}"
 cd ..
done
#run-download.sh
srr=$1

fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $srr|paste - - - -|awk -F'\t' -v reads_down=100000 -v reads_max_down=reads_down*40 -v min_len=30 'NR>reads_max_down||$2~"[0-3]"{exit} {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}} END{for(i in reads){print i"\t"reads[i]["1"]"\t"reads[i]["2"]}}'|shuf|head -n 300|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}' | gzip -c > $srr.fasta.gz && qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr"
#run-blast.sh
srr=$1
dir=$HOME/work3/metasearch/2023/db

zcat $srr.fasta.gz | blastn -db $dir/create_blast_db_tempdir/database.fasta -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |awk -F'\t' -v percent=0.99 -v min_bit=100 '{split($1,arr,"/"); if(data[arr[1]][$2][arr[2]]==0){data[arr[1]][$2][arr[2]]=$12}} END{for(read in data){bit=0; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>bit){bit=temp}}; if(bit>=min_bit){for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>=bit*percent){print read"\t"hit"\t"(data[read][hit]["1"]+data[read][hit]["2"])}}}}}'|awk -F'\t' 'FILENAME==ARGV[1]{acc2tax[$1]=$3} FILENAME==ARGV[2]{tax2name[$1]=$2} FILENAME==ARGV[3]{split($2,arr,"."); print $0"\t"tax2name[acc2tax[arr[1]]]}' $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.names.dmp /dev/stdin |awk -F'\t' '{if($1!=old){if(old!=""){ORS=""; print old"\t"; print items[1]; for(i=2;i<=length(items);i++){if(items[i]==""){break}; print ";"items[i]}; print "\n"}; old=$1; split($4,items,";")}else{split($4,arr,";"); for(i=1;i<=length(items);i++){if(items[i]!=arr[i]){items[i]=""; break}}}} END{ORS=""; print old"\t"; print items[1]; for(i=2;i<=length(items);i++){if(items[i]==""){break}; print ";"items[i]}; print "\n"}'|sed 's/\troot;cellular organisms;/\t/' > $srr.txt

cat $srr.txt|awk -F'\t' '{cnt[$2]++} END{for(i in cnt){print i"\t"cnt[i]}}'|sort -k2,2nr -t$'\t' > $srr.result

for DDBJ

more /usr/local/resources/dra/meta/list/sralist |cut -f 1,7|awk -F'\t' '{print $2"\t"$1}'|sed 's%/ddbj_database%/usr/local/resources%'|sort > sralist.forlook
awk -F'\t' 'FILENAME==ARGV[1]{x[$2]=1} FILENAME==ARGV[2] && $1 in x{a[$1]=$2} FILENAME==ARGV[3]{print $0"\t"a[$2]}' ./sra_result.csv.run sralist.forlook sra_result.csv.run > sra_result.csv.run.ddbj
cat sra_result.csv.run.ddbj|awk -F'\t' '$3!=""{print $3}' > sra_result.csv.run.ddbj2

more sra_result.csv.run.ddbj2 |wc -l|awk '{print int(($0 - 1)/300)+1}'
#9388 blocks → used as the upper bound of the -t 1:9388 array job directive below

qsub qsub-make-fasta.sh
# When using epyc.q & intel.q. Jobs seem unlikely to get scheduled unless memory is kept to about 2G or less.

#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=192:00:00
#$ -l s_rt=192:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -t 1:9388

source ~/.bashrc

n=$SGE_TASK_ID

echo $n
mkdir -p $n;

cat sra_result.csv.run.ddbj2|awk -v n=$n 'NR>(n-1)*300&&NR<=n*300'|while read i; do
 m=`expr $m + 1`;
 j=`basename $i .sra`; echo $n $m $i;
 (if [ `file $i|grep gzip|wc -l` = 1 ]; then
  samtools fastq $i|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'
 else
  fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i
 fi )|paste - - - -| awk -F'\t' -v reads_down=100000 -v reads_max_down=reads_down*40 -v min_len=30 '
  NR>reads_max_down||$2~"[0-3]"{exit}
  {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}}
  END{for(i in reads){print i"\t"reads[i]["1"]"\t"reads[i]["2"]}}
 '|shuf|head -n 300|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $n/$j.fasta.gz;
done
tar cvf $n.tar $n
rm -rf $n
# qsub-make-fasta-lowmem.sh

#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -j y
# -t 121,126,127,128,129,130,131,132,133,134,135,136,137,138,18,29,35,49,87,92,
#$ -t 141:9388

source ~/.bashrc

n=$SGE_TASK_ID

echo $n

reads_down=100000
reads_max_down=`expr $reads_down '*' 40`
min_len=30

mkdir -p $n;

cat sra_result.csv.run.ddbj2|awk -v n=$n 'NR>(n-1)*300&&NR<=n*300'|while read i; do
 m=`expr $m + 1`;
 j=`basename $i .sra`; echo $n $m $i;
 if [ `file $i|grep gzip|wc -l` = 1 ]; then
  samtools fastq $i|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
 else
  fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i|paste - - - -|head -n $reads_max_down > /tmp/$j
 fi
 cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
  $2~"[0-3]"{exit} #for solid sequencer
  {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}}
  END{srand(seed); for(i in reads){print rand()"\t"i"\t"reads[i]["1"]"\t"reads[i]["2"]}}
 ' > /tmp/$j.2
 sort /tmp/$j.2|head -n 300|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $n/$j.fasta.gz;
 rm /tmp/$j /tmp/$j.2
done
tar cvf $n.tar $n
rm -rf $n
# short.q looks suitable for jobs under 1 hour and up to about 7.5GB of memory
# However, it apparently cannot access /usr/local/resources and similar paths

#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l short
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=7G,s_vmem=7G
#$ -t 107:9388

Back on suikou

rsync -av --progress ddbj:meta/*.tar .
for i in *.tar; do
 j=`basename $i .tar`;
 if [ ! -e $j ]; then
  tar vxf $i; cd $j;
  ls *.fasta.gz|while read i2; do
   srr=`basename $i2 .fasta.gz`;
   qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr";
  done;
  cd ..;
 fi;
done

Redo analyses that terminated abnormally partway through

for i in `ls *.tar|head -n 4000|tail -n 2000`; do
 j=`basename $i .tar`;
 if [ -e $j ]; then
  cd $j;
  ls *.fasta.gz|while read i2; do
   srr=`basename $i2 .fasta.gz`;
   if [ ! -e $srr.result ]; then
    rm -f $srr.log
    qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr";
   fi;
  done;
  cd ..;
 fi;
done

Data not available at DDBJ

#at ~/work3/metasearch/2023/db
more sra_result.csv.run.ddbj |awk -F'\t' '$3==""' > sra_result.csv.run.ddbj.notinddbj

for n in `seq 1 2`; do
 mkdir n$n; cd n$n
 awk -F'\t' -v n=$n 'NR>(n-1)*10000&&NR<=n*10000' ../sra_result.csv.run.ddbj.notinddbj|cut -f 2|while read srr; do
  if [ ! -e $srr.result ]; then echo ../run-download.sh $srr; fi
 done|xargs -I{} -P5 bash -c "{}"
 cd ..
done

SRA metadata download 2022/8/4

#wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_20220803.tar.gz <= this one probably covers only about the most recent year(?)
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20220718.tar.gz


zcat NCBI_SRA_Metadata_Full_20220718.tar.gz |awk '$1=="<RUN"{a=$0; sub(".*<RUN .*accession=\"","",a); sub("\".*","",a)} $1=="</RUN>"{if(b==1){print a}; a=0; b=0} a!=0&&$1=="<EXPERIMENT_REF"{b=1; sub(".*<EXPERIMENT_REF .*accession=\"","",$0); sub("\".*","",$0); a=a"\t"$0}' > NCBI_SRA_Metadata_Full_20220718.R2X #The strings command breaks lines at characters like ü, so it is not used here

Download the list of metagenome SRX entries.

https://www.ncbi.nlm.nih.gov/sra/?term=metagenome

Open the page and use Send to → File → Summary to download the results. RunInfo format would really be preferable, but only 100,000 records can be downloaded that way.

cut -f 1 -d , sra_result.csv |tail -n+2|sed 's/"//g'|awk -F'\t' 'FILENAME==ARGV[1]{X2R[$2]=$1} FILENAME==ARGV[2]{print $1"\t"X2R[$1]}' NCBI_SRA_Metadata_Full_20220718.R2X /dev/stdin | awk -F'\t' '$2!=""{print $0; n++} END{print NR"\t"n > "/dev/stderr"}' > sra_result.csv.run
#2,831,286 of 2,853,384 records valid

SRR download + blast, 2022 version

::::::::::::::
run-batch-download.sh
::::::::::::::
#!/bin/bash

#bash run-batch-download.sh <dir name> <target row num>

csv="sra_result.csv.run"
numdir="$1"
i="$2"
readnum=10000

sra=`awk -F'\t' 'NR=='$i'{print $2}' "$csv"`;

fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;

cd $numdir
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";


::::::::::::::
run-blast.sh
::::::::::::::
#!/bin/bash

set -x

##USAGE: bash program.sh [input.fasta]

wdir="$PWD"
cd /tmp

shopt -s expand_aliases;

alias time='/usr/bin/time -f "real time: %E (%e s)\nsystem time: %S s\nuser time:%U s\nmax memory: %M KB"'

rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/db_merged.maskadaptors.fasta* /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/nucl_gb.accession2taxid.gz /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/names.dmp.sname.path.gz /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/blastn_taxname.txt /tmp/;
rsync -av /suikou/tool/ncbi-blast-2.12.0+/bin/blastn /tmp/
mv "$wdir"/$1 /tmp/

total=$(expr `cat /tmp/$1|wc -l` / 2)
time /tmp/blastn -db /tmp/db_merged.maskadaptors.fasta -query /tmp/$1 -num_threads 2 -outfmt 6 -max_target_seqs 500 > /tmp/$1.blastn

time awk -F"\t" -f /tmp/blastn_taxname.txt /tmp/$1.blastn <(zcat /tmp/nucl_gb.accession2taxid.gz) <(zcat /tmp/names.dmp.sname.path.gz) //tmp/$1.blastn | awk -F'\t' -v percent=0.99 '
#LCA search function. To make variables local in awk, all variables used inside the function are listed after the real arguments; this is a well-known awk quirk.
function searchLCA(data,  i, j, res, res2, str, n, stopflag){
 #Scan all hits of the same read and blank out the taxonomy path elements that are not shared.
 for(i in data){
  if(n==0){n=split(i,res,";")}
  else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
 }
 if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
 for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
 return str;
}
{
 #When a new read starts, reset the various variables
 if($1!=old){if($2!=""){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; basescore=$12; data[$2]=1; old=$1; oldstr=$0}}
 #If the row belongs to the same read as the previous line and is above the threshold (bit score at least 99% of the top hit), add it to the data array
 else if($12>=basescore*percent && $2!=""){data[$2]=1}
}
END{print searchLCA(data)"\t"oldstr}' | awk -F"\t" '{if($4>=95 && $5>=95){print $0}}' | awk -F"\t" -v total=$total '
 BEGIN{list["tax"]="num";}{if($1 in list){list[$1]=list[$1]+1;}else{list[$1]=1;}}
 END{
  cnt=0;
  delete list["tax"];
  PROCINFO["sorted_in"]="@val_num_desc";
  for(i in list){print list[i]"\t"i; cnt+=list[i]}
  print total-cnt"\tNo Hit"
 }
' > $1.summary;

mv $1.summary "$wdir"
rm /tmp/$1.blastn;
rm /tmp/$1

::::::::::::::
run-start-batch.sh
::::::::::::::
#!/bin/bash

#bash $0 <which block of 10,000 entries>

j=$1
beg=$((($j - 1) * 10000 + 1))
fin=$(($j * 10000))

csv=sra_result.csv.run
mkdir $j
for i in `seq $beg $fin`; do
 if [ ! $i -gt `cat $csv | wc -l` ]; then
  echo bash run-batch-download.sh "$j" "$i"
 fi
done|xargs -I{} -P4 bash -c "{}"

::::::::::::::
qsubsh2
::::::::::::::
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 2

source ~/.bashrc

echo SGE_JOB_SPOOL_DIR: "$SGE_JOB_SPOOL_DIR"
echo CMD: "$*"
eval "$*"

echo CMD_FIN_STATUS: $?
cd /home/yoshitake.kazutoshi/work2/metagenome-db/2022

#Run this once beforehand
./vdb-config --interactive

bash run-start-batch.sh 2
#2,831,286 records, so run blocks 1 through 284

for i in `seq 103 110`; do bash run-start-batch.sh $i; done
#Re-run the ones that failed
#@ work2/metagenome-db/2022
#Create the list
for i in ? ?? ???; do if [ -e "$i" ]; then cd $i; ls --color=none|awk -v dir=$i '$0~".log$"{split($1,arr,"."); id[arr[1]]++} $0~".fasta.summary"{split($1,arr,"."); id2[arr[1]]++} END{for(i in id){if(!(i in id2)){print dir"\t"i}}}'; cd ..; fi; done > failed.list

#Check how many failures occurred per server
more failed.list |while read i; do dir=`echo $i|awk '{print $1}'`; id=`echo $i|awk '{print $2}'`; head -n 1 $dir/$id.log|cut -f 7 -d /;done|sort |uniq -c|sort -nr

#Re-run
more failed.list |while read i; do dir=`echo $i|awk '{print $1}'`; id=`echo $i|awk '{print $2}'`; bash run-batch-download_id.sh $dir $id;done
#!/bin/bash

#bash run-batch-download_id.sh <dir name> <sra id>

numdir="$1"
sra="$2"
readnum=10000

fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;

cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";

Fetch 10,000 or more reads

# ~/work2/metagenome-db/2022over10k
./run-get-to-download-list.sh ../2022/1/|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k.sh $dir $id $n; fi; done|xargs -I{} bash -c {}

#All at once
for i in `seq 16 100`; do if [ -e ../2022/$i ]; then ./run-get-to-download-list.sh ../2022/$i/; fi|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k.sh $dir $id $n; fi; done|xargs -I{} -P5 bash -c {}; done
                 
#run-get-to-download-list.sh
dir=$1
find $dir|grep .fasta.summary$|while read i; do awk -F'\t' -v dir=$(basename $dir) -v name=$(basename $i .fasta.summary) '{if($2=="No Hit"){nohit=$1}; cnt+=$1} END{if(cnt>0){p=nohit/cnt}else{p=0}; q=10000; if(cnt==10000&&(cnt - nohit)<1000){if(cnt - nohit == 0){q=1000*10000}else{q=int(1000/(cnt - nohit)*10000)}}; print dir"\t"name"\t"cnt"\t"nohit"\t"p"\t"q}' $i; done
#!/bin/bash

#bash run-batch-download_id_over10k.sh <dir name> <sra id> <read num>

numdir="$1" #91
sra="$2" #SRR13050880
readnum="$3" #10000

mkdir -p $numdir

fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;

cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";

Trying to download from DDBJ first

for i in `seq 152 153`; do if [ -e ../2022/$i ]; then ./run-get-to-download-list.sh ../2022/$i/; fi|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k_ddbj.sh $dir $id $n; fi; done|xargs -I{} -P5 bash -c {}; done
#!/bin/bash

#bash run-batch-download_id_over10k_ddbj.sh <dir name> <sra id> <read num>

numdir="$1" #91
sra="$2" #SRR13050880
readnum="$3" #10000

mkdir -p $numdir

baseftpdir=/tmp/a #folder where the path up to litesra of ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/SRX/SRX882/SRX8823480/SRR12323344/SRR12323344.sra is mounted with curlftpfs
#curlftpfs ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra /tmp/a
url=`wget -O /dev/stdout https://ddbj.nig.ac.jp/resource/sra-run/$sra|grep "[.]sra"|grep -o litesra[^.]*[.]sra |sort|uniq|sed 's/litesra\///'|head -n 1`
if [ "$url" = "" ];then url=$sra; else url=$baseftpdir/$url;fi

fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $url |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;

cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra /home/yoshitake.kazutoshi/work2/metagenome-db/2022/qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";

SRA metadata download

http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/

From the directory above, download

http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20201105.tar.gz #file for obtaining detailed sample information from a sample ID

http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab #maps SRR IDs to sample IDs and BioProject IDs
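For example, with wget (the URLs are the two listed above):

wget http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20201105.tar.gz
wget http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab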

Extracting the tar.gz normally would probably produce over 100 million files, so instead force the contents out as a single text file as shown below.

zcat NCBI_SRA_Metadata_Full_20201105.tar.gz |strings > NCBI_SRA_Metadata_Full_20201105.tar.strings

Extract only the SAMPLE and EXPERIMENT information.

grep -v "^[^< ]" NCBI_SRA_Metadata_Full_20201105.tar.strings|
 awk '
  $0~"^<SAMPLE_SET[ >]"{stage="s1"; print id"\t"title"\t"name; id=""; title=""; name=""}
  stage=="s1" && $0~"^  <SAMPLE[ >]"{stage="s2"}
  stage=="s2" && $0~"^    <IDENTIFIERS[ >]"{stage="s3"}
  stage=="s3" && $0~"^      <PRIMARY_ID[ >]"{sub(/.*<PRIMARY_ID>/,"",$0); sub(/<.*/,"",$0); id=$0}
  stage=="s3" && $0~"^    <TITLE[ >]"{sub(/.*<TITLE>/,"",$0); sub(/<.*/,"",$0); title=$0}
  stage=="s3" && $0~"^    <SAMPLE_NAME[ >]"{stage="s3-2"}
  stage=="s3-2" && $0~"^      <SCIENTIFIC_NAME[ >]"{sub(/.*<SCIENTIFIC_NAME./,"",$0); sub(/<.*/,"",$0); name=$0}
  $0~"^<EXPERIMENT_SET[ >]"{stage="e1"; print id"\t"title"\t"name; id=""; title=""; name=""}
  stage=="e1" && $0~"^  <EXPERIMENT[ >]"{stage="e2"}
  stage=="e2" && $0~"^    <IDENTIFIERS[ >]"{stage="e3"}
  stage=="e3" && $0~"^      <PRIMARY_ID[ >]"{sub(/.*<PRIMARY_ID>/,"",$0); sub(/<.*/,"",$0); id=$0}
  stage=="e3" && $0~"^    <TITLE[ >]"{sub(/.*<TITLE>/,"",$0); sub(/<.*/,"",$0); title=$0}
  stage=="e3" && $0~"^    <DESIGN[ >]"{stage="e3-2"}
  stage=="e3-2" && $0~"^      <DESIGN_DESCRIPTION[ >]"{sub(/.*<DESIGN_DESCRIPTION./,"",$0); sub(/<.*/,"",$0); name=$0}
  END{print id"\t"title"\t"name}
 ' > NCBI_SRA_Metadata_Full_20201105.tar.strings.sam-exp

In addition, from

https://ftp.ncbi.nlm.nih.gov/bioproject/

download

https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml #file for obtaining BioProject details from a BioProject ID
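For example:

wget https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml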

Download the list of SRR download URLs.

https://www.ncbi.nlm.nih.gov/sra/?term=metagenome

Open the page, search for metagenome as described, and download the result file in RunInfo format. (As of August 2022, only 100,000 records can be downloaded this way, so it is not well suited to bulk downloads. Summary format looked like it could download everything, but it lacks the essential RUN information.)

SRA data download

more SraRunInfo.csv |sed 's/"[^"]*"//g'|awk -F, '$19=="ILLUMINA" || $19=="BGISEQ" || $19=="ION_TORRENT" || $19=="LS454" || $19=="OXFORD_NANOPORE" || $19=="PACBIO_SMRT"{print $1}' > SraRunInfo.csv.fordownload
split -l 10000 SraRunInfo.csv.fordownload SraId_
for i in SraId_*; do mkdir $i.fastq; done

j=SraId_aa; for i in `cat $j`; do echo $i; /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i |awk '{if(NR%4==1){if($0~"/1$"){flag=1}else{flag=0}}; if(flag==1){print $0}}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz; done 2>&1 | tee $j.log

If the download from SRA fails, download from EBI

j=SraId_al; for i in $j.fastq/*.fastq.gz; do n=`zcat $i|wc -l`; if [ $n = 0 ];then k=`basename $i .fastq.gz`; echo $k; curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$k&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $i; fi; done 2>&1 | tee $j.ebi.log

Run the two steps above consecutively with a batch script

$ more run-download.sh
j=$1

for i in `cat $j`; do echo $i; /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i |
awk '{if(NR%4==1){if($0~"/1$"){flag=1}else{flag=0}}; if(flag==1){print $0}}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz; done 2>&1 | tee $j.log

for i in $j.fastq/*.fastq.gz; do n=`zcat $i|wc -l`; if [ $n = 0 ];then k=`basename $i .fastq.gz`; echo $k; curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$k&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $i; fi; done 2>&1 | tee $j.ebi.log

Improved version

j=$1

for i in `cat $j`; do
 n=`zcat $j.fastq/$i.fastq.gz|wc -l`;
 if [ $n = 0 ]; then
  echo $i;
  /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz;
 fi
done 2>&1 | tee -a $j.log

for i in `cat $j`; do
 n=`zcat $j.fastq/$i.fastq.gz|wc -l`;
 if [ $n = 0 ]; then
  echo $i;
  curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz;
 fi;
done 2>&1 | tee -a $j.ebi.log

for i in SraId_??; do echo bash ./run-download.sh $i; done|xargs -I{} -P 7 bash -c "{}"