SRA metadata download 2024/03/13
const fs = require('fs');
const zlib = require('zlib');
const tar = require('tar');
const xml2js = require('xml2js');

// Path of the tar.gz file
// Take the first command-line argument
const firstArg = process.argv[2];
const tarGzFilePath = firstArg //'NCBI_SRA_Metadata_Full_20240219.tar.gz';

// Create the gunzip stream
const gunzipStream = fs.createReadStream(tarGzFilePath).pipe(zlib.createGunzip());
// Create the tar stream
const extractStream = gunzipStream.pipe(new tar.Parse());

// Buffers for the output
let res_srr = []
let res_srx = []
let res_srp = []
let res_srs = []

function removeFile(filePath){
  if (fs.existsSync(filePath)) {
    fs.unlinkSync(filePath);
    console.log(filePath + ' was removed.');
  } else {
    console.log(filePath + ' does not exist.');
  }
}
removeFile("output_srr.txt")
removeFile("output_srx.txt")
removeFile("output_srp.txt")
removeFile("output_srs.txt")

// Fired each time a file is extracted from the archive
extractStream.on('entry', (entry) => {
  // Process only XML files
  if (entry.path.endsWith('.xml')) {
    let xmlData = '';
    // Accumulate the XML data
    entry.on('data', (chunk) => {
      xmlData += chunk.toString();
    });
    // Once the XML file has been read completely
    entry.on('end', () => {
      // Convert the XML to JSON
      xml2js.parseString(xmlData, (err, result) => {
        if (err) {
          console.error('Error parsing XML:', err);
        } else {
          try{
            //console.log('JSON data:', JSON.stringify(result, null, 2));
            if("RUN_SET" in result){
              for(let srr of result["RUN_SET"]["RUN"]){
                //console.log("srr: ", srr)
                let idsrr = ""
                let idsrx = ""
                if("IDENTIFIERS" in srr && "PRIMARY_ID" in srr["IDENTIFIERS"][0]){
                  idsrr = srr["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }
                if("EXPERIMENT_REF" in srr && "IDENTIFIERS" in srr["EXPERIMENT_REF"][0] && "PRIMARY_ID" in srr["EXPERIMENT_REF"][0]["IDENTIFIERS"][0]){
                  idsrx = srr["EXPERIMENT_REF"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }else if("EXPERIMENT_REF" in srr && "$" in srr["EXPERIMENT_REF"][0] && "accession" in srr["EXPERIMENT_REF"][0]["$"]){
                  idsrx = srr["EXPERIMENT_REF"][0]["$"]["accession"]
                }
                let res = idsrr+"\t"+idsrx
                console.log(res)
                res_srr.push(res)
                if(res_srr.length>=10000){writefile("output_srr.txt",res_srr); res_srr=[]}
              }
            }else if("STUDY_SET" in result){
              for(let srp of result["STUDY_SET"]["STUDY"]){
                let idsrp = ""
                let srptitle = ""
                let srpabstract = ""
                let srpdesc = ""
                let srptype = ""
                if("IDENTIFIERS" in srp && "PRIMARY_ID" in srp["IDENTIFIERS"][0]){
                  idsrp = srp["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }
                if("DESCRIPTOR" in srp && "STUDY_TITLE" in srp["DESCRIPTOR"][0]){
                  srptitle = srp["DESCRIPTOR"][0]["STUDY_TITLE"][0].replace(/\t/g, " ");
                }
                if("DESCRIPTOR" in srp && "STUDY_DESCRIPTION" in srp["DESCRIPTOR"][0]){
                  srpdesc = srp["DESCRIPTOR"][0]["STUDY_DESCRIPTION"][0].replace(/\t/g, " ");
                }
                if("DESCRIPTOR" in srp && "STUDY_TYPE" in srp["DESCRIPTOR"][0] && "$" in srp["DESCRIPTOR"][0]["STUDY_TYPE"][0] && "existing_study_type" in srp["DESCRIPTOR"][0]["STUDY_TYPE"][0]["$"]){
                  srptype = srp["DESCRIPTOR"][0]["STUDY_TYPE"][0]["$"]["existing_study_type"]
                }
                if("DESCRIPTOR" in srp && "STUDY_ABSTRACT" in srp["DESCRIPTOR"][0]){
                  srpabstract = srp["DESCRIPTOR"][0]["STUDY_ABSTRACT"][0].replace(/\t/g, " ");
                }
                let res = idsrp+"\t"+srptitle+"\t"+srpabstract+"\t"+srpdesc+"\t"+srptype
                console.log(res)
                res_srp.push(res)
                if(res_srp.length>=10000){writefile("output_srp.txt",res_srp); res_srp=[]}
              }
            }else if("SAMPLE_SET" in result){
              //console.log(JSON.stringify(result, null, 2))
              for(let srs of result["SAMPLE_SET"]["SAMPLE"]){
                let idsrs = ""
                let srsname = ""
                let srsdesc = ""
                let srstitle = ""
                let srsdate = ""
                let srslatlon = ""
                if("IDENTIFIERS" in srs && "PRIMARY_ID" in srs["IDENTIFIERS"][0]){
                  idsrs = srs["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }
                if("SAMPLE_NAME" in srs && "SCIENTIFIC_NAME" in srs["SAMPLE_NAME"][0]){
                  srsname = srs["SAMPLE_NAME"][0]["SCIENTIFIC_NAME"][0]
                }
                if("DESCRIPTION" in srs){
                  srsdesc = srs["DESCRIPTION"][0].replace(/\t/g, " ");
                }
                if("TITLE" in srs){
                  srstitle = srs["TITLE"][0].replace(/\t/g, " ");
                }
                if("SAMPLE_ATTRIBUTES" in srs && "SAMPLE_ATTRIBUTE" in srs["SAMPLE_ATTRIBUTES"][0]){
                  srs["SAMPLE_ATTRIBUTES"][0]["SAMPLE_ATTRIBUTE"].forEach(att => {
                    if(att["TAG"][0]==="collection_date"){
                      srsdate = att["VALUE"][0]
                    }else if(att["TAG"][0]==="lat_lon"){
                      srslatlon = att["VALUE"][0]
                    }
                  })
                }
                let res = idsrs+"\t"+srsname+"\t"+srsdesc+"\t"+srstitle+"\t"+srsdate+"\t"+srslatlon
                console.log(res)
                res_srs.push(res)
                if(res_srs.length>=10000){writefile("output_srs.txt",res_srs); res_srs=[]}
              }
            }else if("EXPERIMENT_SET" in result){
              //console.log(JSON.stringify(result, null, 2))
              for(let srx of result["EXPERIMENT_SET"]["EXPERIMENT"]){
                let idsrx = ""
                let idsrp = ""
                let idsrs = ""
                let srxtitle = ""
                let srxplat = ""
                let srxplat2 = ""
                if("IDENTIFIERS" in srx && "PRIMARY_ID" in srx["IDENTIFIERS"][0]){
                  idsrx = srx["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }
                if("TITLE" in srx){
                  srxtitle = srx["TITLE"][0]
                }
                if("STUDY_REF" in srx && "IDENTIFIERS" in srx["STUDY_REF"][0] && "PRIMARY_ID" in srx["STUDY_REF"][0]["IDENTIFIERS"][0]){
                  idsrp = srx["STUDY_REF"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                }else if("STUDY_REF" in srx && "$" in srx["STUDY_REF"][0] && "accession" in srx["STUDY_REF"][0]["$"]){
                  idsrp = srx["STUDY_REF"][0]["$"]["accession"]
                }
                if("PLATFORM" in srx){
                  srxplat = Object.keys(srx["PLATFORM"][0])[0]
                  if("INSTRUMENT_MODEL" in srx["PLATFORM"][0][srxplat][0]){
                    srxplat2 = srx["PLATFORM"][0][srxplat][0]["INSTRUMENT_MODEL"][0]
                  }
                }
                if("DESIGN" in srx){
                  for(let srs of srx["DESIGN"]){
                    if("SAMPLE_DESCRIPTOR" in srs && "IDENTIFIERS" in srs["SAMPLE_DESCRIPTOR"][0] && "PRIMARY_ID" in srs["SAMPLE_DESCRIPTOR"][0]["IDENTIFIERS"][0]){
                      idsrs = srs["SAMPLE_DESCRIPTOR"][0]["IDENTIFIERS"][0]["PRIMARY_ID"][0]
                    }else if("SAMPLE_DESCRIPTOR" in srs && "$" in srs["SAMPLE_DESCRIPTOR"][0] && "accession" in srs["SAMPLE_DESCRIPTOR"][0]["$"]){
                      idsrs = srs["SAMPLE_DESCRIPTOR"][0]["$"]["accession"]
                    }
                    //There seems to be basically one SRS per SRX, so this could probably be read further outside the loop
                    let res = idsrx+"\t"+idsrp+"\t"+idsrs+"\t"+srxtitle+"\t"+srxplat+"\t"+srxplat2
                    console.log(res)
                    res_srx.push(res)
                    if(res_srx.length>=10000){writefile("output_srx.txt",res_srx); res_srx=[]}
                  }
                }
              }
            }
          }catch(e){
            console.error(e)
            console.error(JSON.stringify(result, null, 2))
          }
        }
      });
    });
  } else {
    // Ignore non-XML files
    entry.resume();
  }
});

// Error handling
extractStream.on('error', (err) => {
  console.error('Error extracting file:', err);
});

// On completion
extractStream.on('end', () => {
  console.log('Finished');
  writefile("output_srr.txt", res_srr)
  writefile("output_srx.txt", res_srx)
  writefile("output_srp.txt", res_srp)
  writefile("output_srs.txt", res_srs)
});

function writefile(filePath, content){
  try {
    fs.appendFileSync(filePath, content.join("\n")+"\n");
    console.log(filePath+' was written successfully.');
  } catch (err) {
    console.error(filePath+': an error occurred while writing:', err);
  }
}
cd ~/work3/metasearch/2023/sra-20240219/
file=`curl https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/|grep NCBI_SRA_Metadata_Full_|tail -n 1|sed 's%</a>.*%%; s/.*>//'` # NCBI_SRA_Metadata_Full_20240219.tar.gz
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/$file
npm install zlib tar xml2js
node --max-old-space-size=204800 run3.js $file #200GB
However, even with this it crashed with "FATAL ERROR: Scavenger: semi-space copy Allocation failed - JavaScript heap out of memory" before memory usage reached 30 GB, so the script was modified to write its output in chunks. #Uses about 50 GB of memory
awk -F'\t' 'FILENAME==ARGV[1]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srs[$1]=str} FILENAME==ARGV[2]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srp[$1]=str} FILENAME==ARGV[3]{str=$2; for(i=3;i<=NF;i++){str=str"###"$i}; srx[$1]=str; srx2srp[$1]=$2; srx2srs[$1]=$3} FILENAME==ARGV[4]{srxid=$2; print $0"\t"srx[srxid]"\t"srs[srx2srs[srxid]]"\t"srp[srx2srp[srxid]]}' output_srs.txt output_srp.txt output_srx.txt output_srr.txt > output.txt
ssh ddbj
qlogin
cp /usr/local/resources/dra/meta/list/sralist sralist.2024-03-13
exit
exit
scp ddbj:sralist.2024-03-13 .
cat output.txt| awk -F'\t' '{split($4,arr,"###"); split(arr[5],arr2," "); if(length(arr2)==4){split($3,arr,"###"); print $0}}'|awk -F'\t' 'FILENAME==ARGV[1]{dra[$7]=$0} FILENAME==ARGV[2]&& $1 in dra{print dra[$1]"\t"$0}' sralist.2024-03-13 /dev/stdin > sralist.2024-03-13.inddbj-with-latlon
cat output.txt |grep -i metagenom|awk -F'\t' 'FILENAME==ARGV[1]{dra[$7]=$0} FILENAME==ARGV[2]&& $1 in dra{print dra[$1]"\t"$0}' sralist.2024-03-13 /dev/stdin > sralist.2024-03-13.inddbj-metagenom
for i in db3 db3-2 db3-3 db4 db4-2; do ls ../$i|grep "^[1-9][0-9]*$"|while read j; do ls ../$i/$j|grep "[.]fasta[.]gz$"|sed 's/[.]fasta[.]gz$//; s/[.]lite$//'; done; done > downloaded_fasta_list.db3-db4.txt
more downloaded_fasta_list.db3-db4.txt|sort|uniq > downloaded_fasta_list.db3-db4.sort.txt
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($7 in a){print $0}' downloaded_fasta_list.db3-db4.sort.txt sralist.2024-03-13.inddbj-with-latlon > sralist.2024-03-13.inddbj-with-latlon.new
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($7 in a){print $0}' downloaded_fasta_list.db3-db4.sort.txt sralist.2024-03-13.inddbj-metagenom > sralist.2024-03-13.inddbj-metagenom.new
#For display in mitosearch
cat output.txt| awk -F'\t' '{split($4,arr,"###"); split(arr[5],arr2," "); if(length(arr2)==4){print $0}}'|awk -F'\t' 'FILENAME==ARGV[1]{drr[$1]=1} FILENAME==ARGV[2]&& $1 in drr{print $0}' downloaded_fasta_list.db3-db4.sort.txt /dev/stdin > downloaded_fasta_list.db3-db4.sort.txt.with-latlon
mkdir ../test2
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME!=ARGV[1]{if(FNR==1){print FILENAME}; if($0~"^id"){if($2 in a){id=$2}else{id=""}}; if(id!=""){print $0 > "../test2/"FILENAME}}' downloaded_fasta_list.db3-db4.sort.txt.with-latlon *.tsv
cd ../test2
for i in *.tsv; do mkdir -p ../db_fish_ja/$i; done
for i in *.tsv; do awk -F'\t' '{if(FNR==1){print FILENAME}; if($0~"^id"){id=$2}; if(id!=""){print $0 > "../db_fish_ja/"FILENAME"/"id".input"}}' $i; done
#For metasearch
cat sralist.2024-03-13.inddbj-metagenom.new |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' '{print $7"\t"$1}' > sralist.new.txt
pigz sralist.new.txt
scp sralist.new.txt.gz ddbj:
ssh ddbj
qlogin
mkdir db5
mv sralist.new.txt.gz db5/
cd db5
#Extract reads in chunks of 5,000 runs
ln -s ~/db4/run-get-reads.sh .
zcat sralist.new.txt.gz | awk -F'\t' -v block=5000 '{n=int((NR-1)/block)+1; print n"\t"$1"\t"$2 > "split."n}'
mkdir -p log
for i in split.*; do qsub -j y -o log/getreads.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done
SRR download 2024/01/18
Download the list of metagenome SRX entries.
https://www.ncbi.nlm.nih.gov/sra/?term=metagenome
Open it and download via Send to → File → Summary. RunInfo format would really be preferable, but that way only 100,000 records can be downloaded.
Transfer the downloaded sra_result_2024-01-19.csv to the DDBJ supercomputer and extract the SRR file locations.
more sra_result_2024-01-19.csv |cut -f 1 -d,|tail -n+2|sed 's/^"//; s/"$//'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1; n++} FILENAME==ARGV[2] && $8 in a{print $0; m++} END{print n,m,m/n*100 > "sralist.info"}' /dev/stdin /usr/local/resources/dra/meta/list/sralist > sralist.txt
#3,168,212 of 4,127,550 runs had files (76.7577%)
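Quick arithmetic check of the percentage noted in the comment above:
awk 'BEGIN {printf "%.4f\n", 3168212/4127550*100}'   # -> 76.7577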
However, sra_result.csv downloaded via a web browser sometimes seems to stop partway through; downloading it a few times appears to eventually give the complete file. Alternatively, to obtain the SRR list from the command line:
docker run -t --rm ncbi/blast:2.13.0 bash -c "esearch -db sra -query metagenome|efetch -format runinfo"|cut -f 1 -d,|tail -n+2 > ~/work3/metasearch/2023/db4/sra_runinfo.txt
cat sra_runinfo.txt |cut -f 1 -d,|tail -n+2|sed 's/^"//; s/"$//'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1; n++} FILENAME==ARGV[2] && $7 in a{print $0; m++} END{print n,m,m/n*100 > "sralist.info"}' /dev/stdin /usr/local/resources/dra/meta/list/sralist > sralist.txt
#Data for metasearch to retrieve sample information
#/usr/local/resources/dra/meta/list/sralist -> sralist.2024-02-20
more sra_result_2024_01-19.csv|awk -F'","' '{print $1"\t"$2"\t"$3"\t"$7}'|sed 's/^"//' > sra_result_2024_01-19.csv.tsv
awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&$8 in a{b[$8]=$7} FILENAME==ARGV[3]{OFS="\t"; $1=$1"\t"b[$1]; print $0}' sra_result_2024_01-19.csv.tsv sralist.2024-02-20 ./sra_result_2024_01-19.csv.tsv > ./sra_result_2024_01-19.csv.tsv2
more ./sra_result_2024_01-19.csv.tsv2|awk -F'\t' '$2!=""' > sra_result_2024_01-19.csv.tsv3
Extract the data that needs to be added
cat sralist.txt |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($1 in a){print $0}' ~/meta/sra_result.csv.run.ddbj2 /dev/stdin > sralist.new.txt
cat sralist.new.txt|cut -f 1|xargs ls > sralist.new.exist.txt
cat ~/meta/sra_result.csv.run.ddbj2 sralist.new.exist.txt > sralist.alltrial.txt
cat sralist.txt |sed 's%/ddbj_database%/usr/local/resources%'|awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && $1 in a{print $0}' sralist.new.exist.txt /dev/stdin > sralist.new.exist.full.txt
Extract 300 reads per run
awk -F'\t' -v block=5000 '{n=int((NR-1)/block)+1; print n"\t"$7"\t"$1 > "split."n}' sralist.new.exist.full.txt
mkdir -p log
for i in split.*; do qsub -j y -o log/getreads.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done
i=$2    #SRR ID
dir=$1  #output dir (e.g. 1)
path=$3 #path to the .sra file
n=$4    #number of reads to output (e.g. 300)
reads_down=300000 #How many reads (after removing Ns, length >= min_len) to feed into the random sampling. For paired-end data one read pair counts as one read.
reads_max_down=`expr $reads_down '*' 40` #Number of reads to download in the first place.
min_len=30
j=$i
k=$path #`grep $j$'\t' sralist.dir.txt|cut -f 3`
if [ "$k" != "" ]; then
echo $k
mkdir -p $dir
if [ `file $k|grep gzip|wc -l` = 1 ]; then
echo bam
#Many DDBJ ERR files seem to be unmapped BAM even though the extension is .sra. Moreover, apparently when not paired-end(?), each read appears twice, once forward and once reverse. With this script, duplicated read names end up duplicated in the final output.
samtools fastq $k|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
else
echo sra
fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $k|paste - - - -|head -n $reads_max_down > /tmp/$j
fi
cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
$2~"[0-3]"{exit} #for solid sequencer
BEGIN{srand(seed)}
{
 #read names like @SRR052204.4/4 can occur, i.e. /1 through /4
 str=$2; gsub(/[^ACGTacgt]/,"",str);
 if(length(str)>=min_len){
  split($1,arr," ");
  split(arr[1],arr2,"/");
  if(arr2[1]!=old){
   old=arr2[1]
   n++
   if(n>1){
    print rand()"\t"old"\t"r1"\t"r2
   }
   r1=""
   r2=""
   if(n>reads_down){exit}
  }
  if(length($2)>length(r1)){r2=r1; r1=$2}
  else if(length($2)>length(r2)){r2=$2}
 }
}
END{print rand()"\t"old"\t"r1"\t"r2}
' > /tmp/$j.2
sort --parallel=1 --buffer-size=1G /tmp/$j.2|head -n $n|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $dir/$j.fasta.gz; rm /tmp/$j /tmp/$j.2
else
echo No sra
fi
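The random subsampling above works by prefixing each read pair with rand(), sorting, and keeping the first n lines. A minimal sketch of the same idea on plain numbered lines (input and sample size are made up for illustration):
seq 1 10 | awk -v seed="$RANDOM" 'BEGIN{srand(seed)} {print rand()"\t"$0}' | sort | head -n 3 | cut -f 2-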
Finding failures
#Find failed jobs
for i in getreads.o*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error.log
#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error.job
mkdir rerun
awk -F'\t' -v block=100 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done
Once more
#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]&&!($2 in a){print $0}' /dev/stdin <(cat split.*) > error2.job
mkdir rerun2
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i 300; done'; done
Compress for transfer
mkdir -p log
for i in $(seq 1 `ls split.*|wc -l`); do qsub -j y -o log/tarreads.$i -N tarreads ~/qsub2GB.sh tar cvf $i.tar $i; done
On m32s
cd ~/work3/metasearch/2023/db4
rsync -av --progress ddbj:db4/*.tar .
for i in [1-9]*.tar; do
 tar vxf $i
 j=`basename $i .tar`
 cd $j; ls *.fasta.gz|while read i2; do srr=`basename $i2 .fasta.gz`; qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr"; done; cd ..; done
srr=$1
dir=$HOME/work3/metasearch/2023/db
db=/suikou/files/m768-data2/2023jissyu/nanopore/db/mergedDB.maskadaptors.fa
bitscore=100
top=0.99
set -eux
set -o pipefail
zcat $srr.fasta.gz |
 blastn -db $db -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' '{split($1,arr,"/"); if(arr[1]!=old){for(hit in data){temp[hit]=data[hit]["1"]+data[hit]["2"]}; PROCINFO["sorted_in"]="@val_num_desc"; for(hit in temp){print old"\t"hit"\t"temp[hit]}; old=arr[1]; delete data; delete temp}; if(data[$2][arr[2]]==0){data[$2][arr[2]]=$12}}' |
 awk -F'\t' '$3>'$bitscore'{if(a[$1]==1){if($3>=topbit*'$top'){print $0}}else{a[$1]=1; topbit=$3; print $0}}' |
 awk -F'\t' 'FILENAME==ARGV[1]{name[$1]=$2} FILENAME==ARGV[2]{print name[$2]"\t"$0}' $db.path /dev/stdin |
 awk -F'\t' '
 function searchLCA(data, i, j, res, res2, str, n, stopflag){
  for(i in data){
   if(n==0){n=split(i,res,";")}
   else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
  }
  if(res[1]!=""){str=res[1]}
  else{
   #i: taxonomy path
   #Chloroplast and plant 18S sequences seem to be highly similar, so when both are mixed in, give priority to chloroplast
   chloroplast=0
   delete datachloro
   for(i in data){
    if(i~"^Bacteria;Cyanobacteria;Cyanobacteriia;Chloroplast;"){chloroplast++; datachloro[i]=1}
   }
   if(chloroplast>0){
    n2=0
    for(i in datachloro){
     if(n2==0){n2=split(i,res,";")}
     else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
    }
   }
  }
  if(res[1]!=""){str=res[1]}
  else{
   str="unknown"; stopflag=1
  };
  for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
  return str;
 }
 {
  if($2!=old){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; data[$1]=1; old=$2; oldstr=$0}
  else{data[$1]=1}
 }
 END{if(length(data)>0){print searchLCA(data)"\t"oldstr}}
 ' > $srr.txt
awk -F'\t' '{cnt[$1]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result
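searchLCA just takes the longest common prefix of the ';'-separated taxonomy paths of the retained hits (with the chloroplast override above). A toy check of the prefix logic with two made-up paths:
printf '%s\n' \
 "Eukaryota;Chordata;Actinopteri;Clupeiformes;Engraulidae" \
 "Eukaryota;Chordata;Actinopteri;Clupeiformes;Clupeidae" |
awk '{data[$0]=1}
END{
 for(i in data){
  if(n==0){n=split(i,res,";")}
  else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
 }
 str=res[1]; stopflag=0
 for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
 print str
}'
# -> Eukaryota;Chordata;Actinopteri;Clupeiformes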
Error investigation
for k in *.tar; do k=`basename $k .tar`; echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done > check.txt
If there are errors:
grep gz check.txt|while read i; do
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr";
 cd ..
done
Investigate samples with few hit reads
find [1-9]*|grep txt$|xargs -I{} -P 6 bash -c "bash run-check-hit.sh {}" > check-reads.txt
#run-check-hit.sh
file=$1 #1000/SRR16966126.txt
n1=300
n_min=200
n_max=100000
n_req=300
id=`basename $file .txt`
dir=`dirname $file`
l=`cat $file|wc -l`
if [ $l -lt $n_min ]; then
 if [ `zcat $dir/$id.fasta.gz |grep "^>"|sed 's%/.*%%' | wc -l` = $n1 ]; then
  if [ $l = 0 ];then
   k=$n_max
  else
   k=`awk 'BEGIN {print int('$n_req'/'$l'*'$n1')}'`
  fi
  echo $file$'\t'$k
 fi
fi
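In other words, when a run yields fewer than n_min classified reads out of the initial n1 = 300, column 2 estimates how many reads should be re-sampled to expect about n_req hits. Worked example with hypothetical numbers:
# Hypothetical run: 300 reads sampled (n1) but only 30 lines in the .txt result (l)
awk 'BEGIN {print int(300/30*300)}'   # -> 3000 reads to fetch next time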
Fetch a larger FASTQ subset
On the DDBJ supercomputer
more /usr/local/resources/dra/meta/list/sralist |awk -F'\t' '{print $7"\t"$1}'|sed 's%ddbj_database%usr/local/resources%' > sralist.txt
sort sralist.txt > sralist.sorted.txt
split -l 200 check-reads.txt split-check-reads_
for i in split-check-reads_*; do qsub -j y -N getreads ./qsub.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done
#run-get-reads.sh
i=$1
n=$2
reads_down=300000 #How many reads (after removing Ns, length >= min_len) to feed into the random sampling. For paired-end data one read pair counts as one read.
reads_max_down=`expr $reads_down '*' 40` #Number of reads to download in the first place.
min_len=30
j=`echo "$i"|cut -f 2 -d/|cut -f 1 -d.`
dir=`echo "$i"|cut -f 1 -d/`
#k=`look $j sralist.sorted.txt|cut -f 2` #look seems to run out of memory with 2 GB
k=`grep $j$'\t' sralist.sorted.txt|cut -f 2`
if [ "$k" != "" ]; then
echo $k
mkdir -p $dir
if [ `file $k|grep gzip|wc -l` = 1 ]; then
echo bam
#Many DDBJ ERR files seem to be unmapped BAM even though the extension is .sra. Moreover, apparently when not paired-end(?), each read appears twice, once forward and once reverse. With this script, duplicated read names end up duplicated in the final output.
samtools fastq $k|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
else
echo sra
fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $k|paste - - - -|head -n $reads_max_down > /tmp/$j
fi
cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
$2~"[0-3]"{exit} #for solid sequencer
BEGIN{srand(seed)}
{
 #read names like @SRR052204.4/4 can occur, i.e. /1 through /4
 str=$2; gsub(/[^ACGTacgt]/,"",str);
 if(length(str)>=min_len){
  split($1,arr," ");
  split(arr[1],arr2,"/");
  if(arr2[1]!=old){
   old=arr2[1]
   n++
   if(n>1){
    print rand()"\t"old"\t"r1"\t"r2
   }
   r1=""
   r2=""
   if(n>reads_down){exit}
  }
  if(length($2)>length(r1)){r2=r1; r1=$2}
  else if(length($2)>length(r2)){r2=$2}
  reads[arr2[1]][arr2[2]]=$2;
 }
}
END{print rand()"\t"old"\t"r1"\t"r2}
' > /tmp/$j.2
sort --parallel=1 --buffer-size=1G /tmp/$j.2|head -n $n|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $dir/$j.fasta.gz; rm /tmp/$j /tmp/$j.2
else
echo No sra
fi
Finding failures
#Find failed jobs
for i in getreads.o*; do if [ `tail -n 1 $i|grep "CMD_FIN_STATUS: 0"|wc -l` = 0 ]; then echo $i; fi; done | tee error.log
#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]{split($1,arr,"/"); split(arr[2],arr2,"."); if(!(arr2[1] in a)){print $0}}' /dev/stdin <(cat split-check-reads_*) > error.job
mkdir rerun
awk -F'\t' -v block=100 '{n=int((NR-1)/block)+1; print $0 > "rerun/split."n}' error.job
for i in rerun/split.*; do qsub -j y -o log/getreads2.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done
Once more
#Find runs with no output file
find . |grep gz$|sed 's%./[^/]*/%%; s/.fasta.gz//'|awk 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2]{split($1,arr,"/"); split(arr[2],arr2,"."); if(!(arr2[1] in a)){print $0}}' /dev/stdin <(cat rerun/split.*) > error2.job
mkdir rerun2
awk -F'\t' -v block=1 '{n=int((NR-1)/block)+1; print $0 > "rerun2/split."n}' error2.job
for i in rerun2/split.*; do qsub -j y -o log/getreads3.$i -N getreads ~/qsub2GB.sh 'cat '$i' | while read i; do echo $i; bash run-get-reads.sh $i; done'; done
On m32s
for j in [1-9]*; do cd $j; ls *.fasta.gz|while read i2; do srr=`basename $i2 .fasta.gz`; qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr"; done; cd ..; done
Error investigation
for k in [1-9]*; do echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done | tee check.txt
grep gz check.txt|while read i; do
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-4.sh $srr";
 cd ..
done
Obtain ACCESSION -> DEFINITION mappings
#cd work3/metasearch/2023/acc2desc
curl https://ftp.ncbi.nih.gov/genbank/|grep seq.gz|sed 's/<a[^>]*>//; s/<.*//' > seq.list
cat seq.list |while read i; do curl https://ftp.ncbi.nih.gov/genbank/$i|gzip -dc|awk '$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""} $0~"^ACCESSION "{acc=$2} f==1&&substr($0,1,1)!=" "{f=0} $0~"^DEFINITION "{f=1; des=""} f==1{des=des" "$0} END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/'| gzip -c > $i; done
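The awk above remembers the current ACCESSION and concatenates the DEFINITION line together with its indented continuation lines, flushing one accession/definition pair per LOCUS. A toy check on a fabricated GenBank-style snippet:
printf '%s\n' \
 "LOCUS       AB000001             1000 bp" \
 "DEFINITION  Example mitochondrial 16S rRNA gene," \
 "            partial sequence." \
 "ACCESSION   AB000001" \
 "LOCUS       AB000002              500 bp" \
 "DEFINITION  Another example sequence." \
 "ACCESSION   AB000002" |
awk '$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""} $0~"^ACCESSION "{acc=$2} f==1&&substr($0,1,1)!=" "{f=0} $0~"^DEFINITION "{f=1; des=""} f==1{des=des" "$0} END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/'
# prints one line per record: the accession, a tab, then the joined DEFINITION text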
At 10 MB/s this should finish in about 10 days, but DDBJ has a mirror, so use that instead.
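Back-of-the-envelope check of that estimate (the multi-terabyte total size of the compressed GenBank flat files is an assumption here):
# 10 MB/s sustained for 10 days moves about 8.64 TB
awk 'BEGIN {printf "%.2f TB\n", 10*86400*10/1e6}'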
cd acc2desc/
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/; ls $dir|grep seq.gz$|while read i; do zcat $dir/$i|awk '$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""} $0~"^ACCESSION "{acc=$2} f==1&&substr($0,1,1)!=" "{f=0} $0~"^DEFINITION "{f=1; des=""} f==1{des=des" "$0} END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/' | gzip -c > $i; done
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/
ls $dir|grep seq.gz$|while read i; do qsub -N $i run-acc2desc.sh $i; done #This exceeds the 5,000-job qsub limit, so splitting it up is safer
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -j y
i=$1
dir=/usr/local/resources/mirror/ftp.ncbi.nih.gov/genbank/genbank.20221109080028916/
zcat $dir/$i|awk '
$0~"^LOCUS "{if(acc!=""){print acc"\t"des}; acc=""; des=""}
$0~"^ACCESSION "{acc=$2}
f==1&&substr($0,1,1)!=" "{f=0}
$0~"^DEFINITION "{f=1; des=""}
f==1{des=des" "$0}
END{if(acc!=""){print acc"\t"des}}'|sed 's/ \+/ /g; s/\t DEFINITION/\t/' | gzip -c > $i
#cd work3/metasearch/2023/db
find ../acc2desc/|grep gz$|xargs cat > acc2desc.gz
zcat acc2desc.gz | awk -F'\t' 'FILENAME==ARGV[1]{a[$1]=1} FILENAME==ARGV[2] && $1 in a{print $0}' ~/work3/metasearch/2023/db/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid /dev/stdin > acc2desc.maskadaptors
sed 's/;/,/g' acc2desc.maskadaptors > acc2desc.maskadaptors2
For bacteria, use a DB basically built from SILVA only 2023/11/29
Use the summed bit scores of paired-end reads
ln -s ../db/??.tar . for i in ??.tar; do j=`basename $i .tar`; if [ ! -e $j ]; then tar vxf $i; cd $j; ls *.fasta.gz|while read i2; do srr=`basename $i2 .fasta.gz`; qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr"; done; cd ..; fi; done
srr=$1
dir=$HOME/work3/metasearch/2023/db
db=/suikou/files/m768-data2/2023jissyu/nanopore/db/mergedDB.maskadaptors.fa
bitscore=100
top=0.99
zcat $srr.fasta.gz |
 blastn -db $db -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' '{split($1,arr,"/"); if(arr[1]!=old){for(hit in data){temp[hit]=data[hit]["1"]+data[hit]["2"]}; PROCINFO["sorted_in"]="@val_num_desc"; for(hit in temp){print old"\t"hit"\t"temp[hit]}; old=arr[1]; delete data; delete temp}; if(data[$2][arr[2]]==0){data[$2][arr[2]]=$12}}' |
 awk -F'\t' '$3>'$bitscore'{if(a[$1]==1){if($3>=topbit*'$top'){print $0}}else{a[$1]=1; topbit=$3; print $0}}' |
 awk -F'\t' 'FILENAME==ARGV[1]{name[$1]=$2} FILENAME==ARGV[2]{print name[$2]"\t"$0}' $db.path /dev/stdin |
 awk -F'\t' '
 function searchLCA(data, i, j, res, res2, str, n, stopflag){
  for(i in data){
   if(n==0){n=split(i,res,";")}
   else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
  }
  if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
  for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
  return str;
 }
 {
  if($2!=old){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; data[$1]=1; old=$2; oldstr=$0}
  else{data[$1]=1}
 }
 END{if(length(data)>0){print searchLCA(data)"\t"oldstr}}
 ' > $srr.txt
awk -F'\t' '{cnt[$1]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result
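The first awk in that pipeline records one bit score per (read, subject, mate) and, when the read name changes, reports the /1+/2 sum per subject. A minimal illustration on fabricated outfmt 6 lines (only columns 1, 2 and 12 matter; requires gawk):
awk 'BEGIN{OFS="\t";
 print "r1/1","subjectA",0,0,0,0,0,0,0,0,0,80;
 print "r1/2","subjectA",0,0,0,0,0,0,0,0,0,90;
 print "r1/1","subjectB",0,0,0,0,0,0,0,0,0,95;
 print "end/1","x",0,0,0,0,0,0,0,0,0,1}' > toy.blast   # last record is a sentinel that flushes r1
awk -F'\t' '{split($1,arr,"/"); if(arr[1]!=old){for(hit in data){temp[hit]=data[hit]["1"]+data[hit]["2"]}; PROCINFO["sorted_in"]="@val_num_desc"; for(hit in temp){print old"\t"hit"\t"temp[hit]}; old=arr[1]; delete data; delete temp}; if(data[$2][arr[2]]==0){data[$2][arr[2]]=$12}}' toy.blast
# -> r1  subjectA  170
# -> r1  subjectB  95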
Error investigation
for k in *.tar; do k=`basename $k .tar`; echo $k; ls $k|grep gz$|while read i; do j=`basename $i .fasta.gz`; if [ "`tail -n 1 $k/$j.log`" != "CMD_FIN_STATUS: 0" ]; then echo $k/$i; fi; done; done > check.txt
grep gz check.txt|while read i; do
 dir=`dirname $i`
 srr=`basename $i .fasta.gz`
 cd $dir
 qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast3-2.sh $srr";
 cd ..
done
Output the lexicographically smallest of the BLAST top hits 2023/9/12
for i in *.tar; do j=`basename $i .tar`; if [ ! -e $j ]; then tar vxf $i; cd $j; ls *.fasta.gz|while read i2; do srr=`basename $i2 .fasta.gz`; qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast2.sh $srr"; done; cd ..; fi; done
srr=$1
dir=$HOME/work3/metasearch/2023/db
zcat $srr.fasta.gz |
 blastn -db $dir/create_blast_db_tempdir/database.fasta -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' -v min_bit=100 '{split($1,arr,"/"); if(data[arr[1]][$2][arr[2]]==0){data[arr[1]][$2][arr[2]]=$12}} END{for(read in data){bit=0; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>bit){bit=temp}}; if(bit>=min_bit){delete out; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp==bit){out[hit]=1}}; PROCINFO["sorted_in"]="@ind_str_asc"; for(hit in out){print read"\t"hit"\t"temp; break}}}}'|
 awk -F'\t' 'FILENAME==ARGV[1]{acc2tax[$1]=$3} FILENAME==ARGV[2]{tax2name[$1]=$2} FILENAME==ARGV[3]{acc2desc[$1]=$2} FILENAME==ARGV[4]{split($2,arr,"."); print $0"\t"tax2name[acc2tax[arr[1]]]";acc:"arr[1]":"acc2desc[arr[1]]}' $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.names.dmp $dir/acc2desc.maskadaptors2 /dev/stdin|
 sed 's/\troot;cellular organisms;/\t/' > $srr.txt
awk -F'\t' '{cnt[$4]++} END{PROCINFO["sorted_in"]="@val_num_desc"; for(i in cnt){print i"\t"cnt[i]}}' $srr.txt > $srr.result
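When several subjects tie for the top summed bit score, the @ind_str_asc iteration plus break above deterministically keeps the lexicographically smallest subject ID. A tiny sketch of that tie-break (hypothetical names; requires gawk):
printf 'subjC\nsubjA\nsubjB\n' | awk '{out[$1]=1} END{PROCINFO["sorted_in"]="@ind_str_asc"; for(hit in out){print hit; break}}'   # -> subjA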
SRA download + blast 2023/8/5
for n in `seq 10 20`; do
 mkdir $n; cd $n
 awk -F'\t' -v n=$n 'NR>(n-1)*10000&&NR<=n*10000' ~/oldhome/work2/metagenome-db/2022/sra_result.csv.run|cut -f 2|while read srr; do
  if [ ! -e $srr.result ]; then echo ../run-download.sh $srr; fi
 done|xargs -I{} -P5 bash -c "{}"
 cd ..
done
#run-download.sh
srr=$1
fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $srr|paste - - - -|awk -F'\t' -v reads_down=100000 -v reads_max_down=reads_down*40 -v min_len=30 'NR>reads_max_down||$2~"[0-3]"{exit} {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}} END{for(i in reads){print i"\t"reads[i]["1"]"\t"reads[i]["2"]}}'|shuf|head -n 300|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}' | gzip -c > $srr.fasta.gz && qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr"
#run-blast.sh
srr=$1
dir=$HOME/work3/metasearch/2023/db
zcat $srr.fasta.gz |
 blastn -db $dir/create_blast_db_tempdir/database.fasta -query /dev/stdin -num_threads 4 -outfmt 6 -max_target_seqs 500 |
 awk -F'\t' -v percent=0.99 -v min_bit=100 '{split($1,arr,"/"); if(data[arr[1]][$2][arr[2]]==0){data[arr[1]][$2][arr[2]]=$12}} END{ for(read in data){bit=0; for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>bit){bit=temp}}; if(bit>=min_bit){for(hit in data[read]){temp=data[read][hit]["1"]+data[read][hit]["2"]; if(temp>=bit*percent){print read"\t"hit"\t"(data[read][hit]["1"]+data[read][hit]["2"])}}}}}'|
 awk -F'\t' 'FILENAME==ARGV[1]{acc2tax[$1]=$3} FILENAME==ARGV[2]{tax2name[$1]=$2} FILENAME==ARGV[3]{split($2,arr,"."); print $0"\t"tax2name[acc2tax[arr[1]]]}' $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.nucl_gb.accession2taxid $dir/create_blast_db_tempdir/mergedDB.fa.maskadaptors.names.dmp /dev/stdin |
 awk -F'\t' '{if($1!=old){if(old!=""){ORS=""; print old"\t"; print items[1]; for(i=2;i<=length(items);i++){if(items[i]==""){break}; print ";"items[i]}; print "\n"}; old=$1; split($4,items,";")}else{split($4,arr,";"); for(i=1;i<=length(items);i++){if(items[i]!=arr[i]){items[i]=""; break}}}} END{ORS=""; print old"\t"; print items[1]; for(i=2;i<=length(items);i++){if(items[i]==""){break}; print ";"items[i]}; print "\n"}'|
 sed 's/\troot;cellular organisms;/\t/' > $srr.txt
cat $srr.txt|awk -F'\t' '{cnt[$2]++} END{for(i in cnt){print i"\t"cnt[i]}}'|sort -k2,2nr -t$'\t' > $srr.result
for DDBJ
more /usr/local/resources/dra/meta/list/sralist |cut -f 1,7|awk -F'\t' '{print $2"\t"$1}'|sed 's%/ddbj_database%/usr/local/resources%'|sort > sralist.forlook
awk -F'\t' 'FILENAME==ARGV[1]{x[$2]=1} FILENAME==ARGV[2] && $1 in x{a[$1]=$2} FILENAME==ARGV[3]{print $0"\t"a[$2]}' ./sra_result.csv.run sralist.forlook sra_result.csv.run > sra_result.csv.run.ddbj
cat sra_result.csv.run.ddbj|awk -F'\t' '$3!=""{print $3}' > sra_result.csv.run.ddbj2
more sra_result.csv.run.ddbj2 |wc -l|awk '{print int(($0 - 1)/300)+1}' #9388
qsub qsub-make-fasta.sh
# When using epyc.q & intel.q. Jobs do not seem to get scheduled easily unless memory is kept to about 2 GB.
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=192:00:00
#$ -l s_rt=192:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -t 1:9388
source ~/.bashrc
n=$SGE_TASK_ID
echo $n
mkdir -p $n;
cat sra_result.csv.run.ddbj2|awk -v n=$n 'NR>(n-1)*300&&NR<=n*300'|while read i; do
 m=`expr $m + 1`; j=`basename $i .sra`; echo $n $m $i;
 (if [ `file $i|grep gzip|wc -l` = 1 ]; then
   samtools fastq $i|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'
  else
   fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i
  fi
 )|paste - - - -| awk -F'\t' -v reads_down=100000 -v reads_max_down=reads_down*40 -v min_len=30 '
 NR>reads_max_down||$2~"[0-3]"{exit}
 {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}}
 END{for(i in reads){print i"\t"reads[i]["1"]"\t"reads[i]["2"]}}
 '|shuf|head -n 300|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $n/$j.fasta.gz;
done
tar cvf $n.tar $n
rm -rf $n
# qsub-make-fasta-lowmem.sh
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=2G,s_vmem=2G
#$ -j y
# -t 121,126,127,128,129,130,131,132,133,134,135,136,137,138,18,29,35,49,87,92,
#$ -t 141:9388
source ~/.bashrc
n=$SGE_TASK_ID
echo $n
reads_down=100000
reads_max_down=`expr $reads_down '*' 40`
min_len=30
mkdir -p $n;
cat sra_result.csv.run.ddbj2|awk -v n=$n 'NR>(n-1)*300&&NR<=n*300'|while read i; do
 m=`expr $m + 1`; j=`basename $i .sra`; echo $n $m $i;
 if [ `file $i|grep gzip|wc -l` = 1 ]; then
  samtools fastq $i|awk '{if(NR%4==1){if($1!~"/1$"&&$1!~"/2$"){$1=$1"/1"}}; print $0}'|paste - - - -|head -n $reads_max_down > /tmp/$j
 else
  fastq-dump --split-3 --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i|paste - - - -|head -n $reads_max_down > /tmp/$j
 fi
 cat /tmp/$j |awk -F'\t' -v reads_down=$reads_down -v min_len=$min_len -v seed="${RANDOM}" '
 $2~"[0-3]"{exit} #for solid sequencer
 {str=$2; gsub(/[^ACGTacgt]/,"",str); if(length(str)>=min_len){split($1,arr," "); split(arr[1],arr2,"/"); reads[arr2[1]][arr2[2]]=$2; n++; if(n>=reads_down){exit}}}
 END{srand(seed); for(i in reads){print rand()"\t"i"\t"reads[i]["1"]"\t"reads[i]["2"]}}
 ' > /tmp/$j.2
 sort /tmp/$j.2|head -n 300|cut -f 2-|awk -F'\t' '$2!=""{print ">"substr($1,2)"/1"; print $2} $3!=""{print ">"substr($1,2)"/2"; print $3}'| gzip -c > $n/$j.fasta.gz;
 rm /tmp/$j /tmp/$j.2
done
tar cvf $n.tar $n
rm -rf $n
# If a job finishes within 1 hour and uses up to about 7.5 GB of memory, short.q seems a good fit.
# However, it apparently cannot access /usr/local/resources and the like.
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l short
#$ -l d_rt=01:00:00
#$ -l s_rt=01:00:00
#$ -pe def_slot 1
#$ -l mem_req=7G,s_vmem=7G
#$ -t 107:9388
Back on suikou
rsync -av --progress ddbj:meta/*.tar . for i in *.tar; do j=`basename $i .tar`; if [ ! -e $j ]; then tar vxf $i; cd $j; ls *.fasta.gz|while read i2; do srr=`basename $i2 .fasta.gz`; qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr"; done; cd ..; fi; done
Redo analyses that terminated abnormally partway through
for i in `ls *.tar|head -n 4000|tail -n 2000`; do
 j=`basename $i .tar`
 if [ -e $j ]; then
  cd $j
  ls *.fasta.gz|while read i2; do
   srr=`basename $i2 .fasta.gz`
   if [ ! -e $srr.result ]; then
    rm -f $srr.log
    qsub -p -1000 -j y -q back.q -l mem_req=0G -pe def_slot_back 4 -o $srr.log -N $srr ~/bin/qsubsh4 "nice -n 19 bash ../run-blast.sh $srr";
   fi
  done
  cd ..
 fi
done
Data not in DDBJ
#at ~/work3/metasearch/2023/db
more sra_result.csv.run.ddbj |awk -F'\t' '$3==""' > sra_result.csv.run.ddbj.notinddbj
for n in `seq 1 2`; do
 mkdir n$n; cd n$n
 awk -F'\t' -v n=$n 'NR>(n-1)*10000&&NR<=n*10000' ../sra_result.csv.run.ddbj.notinddbj|cut -f 2|while read srr; do
  if [ ! -e $srr.result ]; then echo ../run-download.sh $srr; fi
 done|xargs -I{} -P5 bash -c "{}"
 cd ..
done
SRA metadata download 2022/8/4
#wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_20220803.tar.gz <= this one probably only covers roughly the most recent year(?)
wget https://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20220718.tar.gz
zcat NCBI_SRA_Metadata_Full_20220718.tar.gz |awk '$1=="<RUN"{a=$0; sub(".*<RUN .*accession=\"","",a); sub("\".*","",a)} $1=="</RUN>"{if(b==1){print a}; a=0; b=0} a!=0&&$1=="<EXPERIMENT_REF"{b=1; sub(".*<EXPERIMENT_REF .*accession=\"","",$0); sub("\".*","",$0); a=a"\t"$0}' > NCBI_SRA_Metadata_Full_20220718.R2X
#Do not use the strings command: it inserts line breaks at characters such as ü
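A quick check of the RUN -> EXPERIMENT_REF extraction on a fabricated XML fragment (accessions are made up):
printf '%s\n' '<RUN accession="SRR0000001" alias="run1">' '  <EXPERIMENT_REF accession="SRX0000001"/>' '</RUN>' |
awk '$1=="<RUN"{a=$0; sub(".*<RUN .*accession=\"","",a); sub("\".*","",a)} $1=="</RUN>"{if(b==1){print a}; a=0; b=0} a!=0&&$1=="<EXPERIMENT_REF"{b=1; sub(".*<EXPERIMENT_REF .*accession=\"","",$0); sub("\".*","",$0); a=a"\t"$0}'
# -> SRR0000001<TAB>SRX0000001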
Download the list of metagenome SRX entries.
https://www.ncbi.nlm.nih.gov/sra/?term=metagenome
Open it and download via Send to → File → Summary. RunInfo format would really be preferable, but that way only 100,000 records can be downloaded.
cut -f 1 -d , sra_result.csv |tail -n+2|sed 's/"//g'|awk -F'\t' 'FILENAME==ARGV[1]{X2R[$2]=$1} FILENAME==ARGV[2]{print $1"\t"X2R[$1]}' NCBI_SRA_Metadata_Full_20220718.R2X /dev/stdin | awk -F'\t' '$2!=""{print $0; n++} END{print NR"\t"n > "/dev/stderr"}' > sra_result.csv.run
#2,831,286 of 2,853,384 records were valid
SRR download + blast, 2022 version
:::::::::::::: run-batch-download.sh ::::::::::::::
#!/bin/bash
#bash run-batch-download.sh <dir name> <target row num>
csv="sra_result.csv.run"
numdir="$1"
i="$2"
readnum=10000
sra=`awk -F'\t' 'NR=='$i'{print $2}' "$csv"`;
fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;
cd $numdir
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";
:::::::::::::: run-blast.sh ::::::::::::::
#!/bin/bash
set -x
##USAGE: bash program.sh [input.fasta]
wdir="$PWD"
cd /tmp
shopt -s expand_aliases; alias time='/usr/bin/time -f "real time: %E (%e s)\nsystem time: %S s\nuser time:%U s\nmax memory: %M KB"'
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/db_merged.maskadaptors.fasta* /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/nucl_gb.accession2taxid.gz /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/names.dmp.sname.path.gz /tmp/;
rsync -av /suikou/files/m1536/mizobata.hideaki/work/mizowork/blastn_shotgun/blastn_taxname.txt /tmp/;
rsync -av /suikou/tool/ncbi-blast-2.12.0+/bin/blastn /tmp/
mv "$wdir"/$1 /tmp/
total=$(expr `cat /tmp/$1|wc -l` / 2)
time /tmp/blastn -db /tmp/db_merged.maskadaptors.fasta -query /tmp/$1 -num_threads 2 -outfmt 6 -max_target_seqs 500 > /tmp/$1.blastn
time awk -F"\t" -f /tmp/blastn_taxname.txt /tmp/$1.blastn <(zcat /tmp/nucl_gb.accession2taxid.gz) <(zcat /tmp/names.dmp.sname.path.gz) //tmp/$1.blastn | awk -F'\t' -v percent=0.99 '
#LCA lookup function. To make variables local in awk, every variable used inside the function is appended after the real arguments; this is apparently an awkward quirk of awk.
function searchLCA(data, i, j, res, res2, str, n, stopflag){
#Scan all hits of the same read and blank out the taxonomy path elements that are not shared.
for(i in data){
 if(n==0){n=split(i,res,";")}
 else{split(i,res2,";"); for(j in res){if(res[j]!=res2[j]){res[j]=""}}}
}
if(res[1]!=""){str=res[1]}else{str="unknown"; stopflag=1};
for(i=2;i<=n;i++){if(stopflag==0 && res[i]!=""){str=str";"res[i]}else{stopflag=1}}
return str;
}
{
#When a new read starts, reinitialize the various variables
if($1!=old){if($2!=""){if(old!=""){print searchLCA(data)"\t"oldstr}; delete data; basescore=$12; data[$2]=1; old=$1; oldstr=$0}}
#Same read as the previous line: if above the threshold (bit score >= 99% of the top hit), add it to the data array
else if($12>=basescore*percent && $2!=""){data[$2]=1}
}
END{print searchLCA(data)"\t"oldstr}' | awk -F"\t" '{if($4>=95 && $5>=95){print $0}}' | awk -F"\t" -v total=$total '
BEGIN{list["tax"]="num";}{if($1 in list){list[$1]=list[$1]+1;}else{list[$1]=1;}}
END{
cnt=0;
delete list["tax"];
PROCINFO["sorted_in"]="@val_num_desc";
for(i in list){print list[i]"\t"i; cnt+=list[i]}
print total-cnt"\tNo Hit"
}
' > $1.summary;
mv $1.summary "$wdir"
rm /tmp/$1.blastn; rm /tmp/$1
:::::::::::::: run-start-batch.sh ::::::::::::::
#!/bin/bash
#bash $0 <which block of 10,000 records>
j=$1
beg=$((($j - 1) * 10000 + 1))
fin=$(($j * 10000))
csv=sra_result.csv.run
mkdir $j
for i in `seq $beg $fin`; do
 if [ ! $i -gt `cat $csv | wc -l` ]; then
  echo bash run-batch-download.sh "$j" "$i"
 fi
done|xargs -I{} -P4 bash -c "{}"
:::::::::::::: qsubsh2 ::::::::::::::
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 2
source ~/.bashrc
echo SGE_JOB_SPOOL_DIR: "$SGE_JOB_SPOOL_DIR"
echo CMD: "$*"
eval "$*"
echo CMD_FIN_STATUS: $?
cd /home/yoshitake.kazutoshi/work2/metagenome-db/2022
#Run this only once
./vdb-config --interactive
bash run-start-batch.sh 2
#2,831,286 records, so run blocks 1-284
for i in `seq 103 110`; do bash run-start-batch.sh $i; done
#Re-run the failed ones
#@ work2/metagenome-db/2022
#Build the list
for i in ? ?? ???; do if [ -e "$i" ]; then cd $i; ls --color=none|awk -v dir=$i '$0~".log$"{split($1,arr,"."); id[arr[1]]++} $0~".fasta.summary"{split($1,arr,"."); id2[arr[1]]++} END{for(i in id){if(!(i in id2)){print dir"\t"i}}}'; cd ..; fi; done > failed.list
#Count how often each server failed
more failed.list |while read i; do dir=`echo $i|awk '{print $1}'`; id=`echo $i|awk '{print $2}'`; head -n 1 $dir/$id.log|cut -f 7 -d /;done|sort |uniq -c|sort -nr
#Re-run
more failed.list |while read i; do dir=`echo $i|awk '{print $1}'`; id=`echo $i|awk '{print $2}'`; bash run-batch-download_id.sh $dir $id;done
#!/bin/bash
#bash run-batch-download.sh <dir name> <target row id>
numdir="$1"
sra="$2"
readnum=10000
fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;
cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";
Fetch 10,000 reads or more
# ~/work2/metagenome-db/2022over10k
./run-get-to-download-list.sh ../2022/1/|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k.sh $dir $id $n; fi; done|xargs -I{} bash -c {}
#All at once
for i in `seq 16 100`; do if [ -e ../2022/$i ]; then ./run-get-to-download-list.sh ../2022/$i/; fi|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k.sh $dir $id $n; fi; done|xargs -I{} -P5 bash -c {}; done
dir=$1
find $dir|grep .fasta.summary$|while read i; do awk -F'\t' -v dir=$(basename $dir) -v name=$(basename $i .fasta.summary) '{if($2=="No Hit"){nohit=$1}; cnt+=$1} END{if(cnt>0){p=nohit/cnt}else{p=0}; q=10000; if(cnt==10000&&(cnt - nohit)<1000){if(cnt - nohit == 0){q=1000*10000}else{q=int(1000/(cnt - nohit)*10000)}}; print dir"\t"name"\t"cnt"\t"nohit"\t"p"\t"q}' $i; done
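Column 6 (q) is the number of reads to fetch on the retry: when all 10,000 reads were downloaded but fewer than 1,000 hit, the target is scaled so that roughly 1,000 hits can be expected. Worked example with hypothetical counts:
# 10,000 reads downloaded, 9,600 "No Hit" -> 400 hits; scaled target:
awk 'BEGIN {print int(1000/400*10000)}'   # -> 25000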
#!/bin/bash
#bash run-batch-download.sh <dir name> <target row id>
numdir="$1" #91
sra="$2" #SRR13050880
readnum="$3" #10000
mkdir -p $numdir
fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $sra |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;
cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra ../qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";
Trying to download from DDBJ first:
for i in `seq 152 153`; do if [ -e ../2022/$i ]; then ./run-get-to-download-list.sh ../2022/$i/; fi|while read i; do n=`echo "$i"|cut -f 6`; if [ $n -gt 10000 ]; then dir=`echo "$i"|cut -f 1`; id=`echo "$i"|cut -f 2`; echo bash ./run-batch-download_id_over10k_ddbj.sh $dir $id $n; fi; done|xargs -I{} -P5 bash -c {}; done
#!/bin/bash
#bash run-batch-download.sh <dir name> <target row id>
numdir="$1" #91
sra="$2" #SRR13050880
readnum="$3" #10000
mkdir -p $numdir
baseftpdir=/tmp/a #Directory where ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra (the part up to litesra of e.g. ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra/SRX/SRX882/SRX8823480/SRR12323344/SRR12323344.sra) is mounted with curlftpfs
#curlftpfs ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/sralite/ByExp/litesra /tmp/a
url=`wget -O /dev/stdout https://ddbj.nig.ac.jp/resource/sra-run/$sra|grep "[.]sra"|grep -o litesra[^.]*[.]sra |sort|uniq|sed 's/litesra\///'|head -n 1`
if [ "$url" = "" ];then url=$sra; else url=$baseftpdir/$url;fi
fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $url |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n $(($readnum * 4))| awk 'NR % 4 == 1 {print ">" $0 } NR % 4 == 2 {print $0}' > $numdir/$sra.fasta;
cd $numdir
rm $sra.log
qsub -p -1000 -j y -l mem_req=0G -o $sra.log -N $sra /home/yoshitake.kazutoshi/work2/metagenome-db/2022/qsubsh2 "nice -n 19 bash ../run-blast.sh $sra.fasta";
SRA metadata download
http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/
download the following two files:
http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/NCBI_SRA_Metadata_Full_20201105.tar.gz #file for obtaining detailed sample information from a sample ID
http://ftp-trace.ncbi.nlm.nih.gov/sra/reports/Metadata/SRA_Accessions.tab #links SRR IDs to sample IDs and BioProject IDs
Extracting the tar.gz normally would likely produce more than 100 million files, so instead force it out as a single text file as shown below.
zcat NCBI_SRA_Metadata_Full_20201105.tar.gz |strings > NCBI_SRA_Metadata_Full_20201105.tar.strings
Extract only the SAMPLE and EXPERIMENT information.
grep -v "^[^< ]" NCBI_SRA_Metadata_Full_20201105.tar.strings| awk ' $0~"^<SAMPLE_SET[ >]"{stage="s1"; print id"\t"title"\t"name; id=""; title=""; name=""} stage=="s1" && $0~"^ <SAMPLE[ >]"{stage="s2"} stage=="s2" && $0~"^ <IDENTIFIERS[ >]"{stage="s3"} stage=="s3" && $0~"^ <PRIMARY_ID[ >]"{sub(/.*<PRIMARY_ID>/,"",$0); sub(/<.*/,"",$0); id=$0} stage=="s3" && $0~"^ <TITLE[ >]"{sub(/.*<TITLE>/,"",$0); sub(/<.*/,"",$0); title=$0} stage=="s3" && $0~"^ <SAMPLE_NAME[ >]"{stage="s3-2"} stage=="s3-2" && $0~"^ <SCIENTIFIC_NAME[ >]"{sub(/.*<SCIENTIFIC_NAME./,"",$0); sub(/<.*/,"",$0); name=$0} $0~"^<EXPERIMENT_SET[ >]"{stage="e1"; print id"\t"title"\t"name; id=""; title=""; name=""} stage=="e1" && $0~"^ <EXPERIMENT[ >]"{stage="e2"} stage=="e2" && $0~"^ <IDENTIFIERS[ >]"{stage="e3"} stage=="e3" && $0~"^ <PRIMARY_ID[ >]"{sub(/.*<PRIMARY_ID>/,"",$0); sub(/<.*/,"",$0); id=$0} stage=="e3" && $0~"^ <TITLE[ >]"{sub(/.*<TITLE>/,"",$0); sub(/<.*/,"",$0); title=$0} stage=="e3" && $0~"^ <DESIGN[ >]"{stage="e3-2"} stage=="e3-2" && $0~"^ <DESIGN_DESCRIPTION[ >]"{sub(/.*<DESIGN_DESCRIPTION./,"",$0); sub(/<.*/,"",$0); name=$0} END{print id"\t"title"\t"name} ' > NCBI_SRA_Metadata_Full_20201105.tar.strings.sam-exp
Also, from
https://ftp.ncbi.nlm.nih.gov/bioproject/
download
https://ftp.ncbi.nlm.nih.gov/bioproject/bioproject.xml #file for obtaining BioProject details from a BioProject ID
Download the list of SRR download URLs.
https://www.ncbi.nlm.nih.gov/sra/?term=metagenome
Open it, search for metagenome as below, and download the result file in RunInfo format. (As of August 2022 this method can only download 100,000 records, so it is not suited to bulk downloads. Summary format looked like it could download everything, but it lacks the crucial RUN information.)
SRA data download
more SraRunInfo.csv |sed 's/"[^"]*"//g'|awk -F, '$19=="ILLUMINA" || $19=="BGISEQ" || $19=="ION_TORRENT" || $19=="LS454" || $19=="OXFORD_NANOPORE" || $19=="PACBIO_SMRT"{print $1}' > SraRunInfo.csv.fordownload
split -l 10000 SraRunInfo.csv.fordownload SraId_
for i in SraId_*; do mkdir $i.fastq; done
j=SraId_aa; for i in `cat $j`; do echo $i; /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i |awk '{if(NR%4==1){if($0~"/1$"){flag=1}else{flag=0}}; if(flag==1){print $0}}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz; done 2>&1 | tee $j.log
If downloading from SRA fails, download from EBI
j=SraId_al; for i in $j.fastq/*.fastq.gz; do n=`zcat $i|wc -l`; if [ $n = 0 ];then k=`basename $i .fastq.gz`; echo $k; curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$k&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $i; fi; done 2>&1 | tee $j.ebi.log
Run the above two steps consecutively with a batch script
$ more run-download.sh
j=$1
for i in `cat $j`; do echo $i; /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i | awk '{if(NR%4==1){if($0~"/1$"){flag=1}else{flag=0}}; if(flag==1){print $0}}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz; done 2>&1 | tee $j.log
for i in $j.fastq/*.fastq.gz; do n=`zcat $i|wc -l`; if [ $n = 0 ];then k=`basename $i .fastq.gz`; echo $k; curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$k&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $i; fi; done 2>&1 | tee $j.ebi.log
Improved version
j=$1
for i in `cat $j`; do
 n=`zcat $j.fastq/$i.fastq.gz|wc -l`
 if [ $n = 0 ]; then
  echo $i; /suikou/tool/sratoolkit.2.10.8-centos_linux64/bin/fastq-dump --split-files --defline-seq '@$ac.$si/$ri' --defline-qual '+' -Z $i |paste - - - -|grep "^@"|awk -F'\t' 'function outseq(){if(old1!="" && length(oldstr)>=20){print old1; print old2; print "+"; print old4}; old1=""} {split($1,arr,"/"); str=$2; gsub(/[^ACGTacgt]/,"",str); if(arr[1]==old){if(length(str)>length(oldstr)){oldstr=str; old1=$1; old2=$2; old4=$4}}else{outseq(); old=arr[1]; oldstr=str; old1=$1; old2=$2; old4=$4}; length($2)} END{outseq()}'|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz;
 fi
done 2>&1 | tee -a $j.log
for i in `cat $j`; do n=`zcat $j.fastq/$i.fastq.gz|wc -l`; if [ $n = 0 ]; then echo $i; curl http://`curl "https://www.ebi.ac.uk/ena/portal/api/filereport?accession=$i&result=read_run&fields=run_accession,fastq_ftp,fastq_md5,fastq_bytes"|tail -n 1|cut -f 2|cut -f 1 -d";"`|zcat|head -n 40000|gzip -c > $j.fastq/$i.fastq.gz; fi; done 2>&1 | tee -a $j.ebi.log
for i in SraId_??; do echo bash ./run-download.sh $i; done|xargs -I{} -P 7 bash -c "{}"