Tricks on file formating
Manipulate fasta file
-
https://github.com/lh3/seqtk is extremely useful
-
Filter contigs shorter than 1000nt:
seqtk seq -L 1000 contigs.fa > contigs.gt1000.fa
# other functions include sampling sequence, etc.
- Convert one line fasta to multiline fasta
# reformat.sh provided in bbtools
reformat.sh fastawrap=70 in=input.fa out=output.fa
Download refseq sequences
#!/usr/bin/env python
from Bio import Entrez
import time
import argparse
import os
import logging
def main():
parser = argparse.ArgumentParser(description='Download genebank sequence')
parser.add_argument('--query', '-q',help="Genebank id for download",required=True)
parser.add_argument('--fasta','-f',help="Path to write the downloaded data",required=True)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s')
logger = logging.getLogger('Retrieve sequence')
Entrez.email = "xxx"
fout = open(args.fasta,"w")
try:
logger.info("Start retriving {} from ncbi nucleotide in fasta format ...".format(args.query))
handle = Entrez.efetch(db="nucleotide", id=args.query, rettype="fasta", retmode="text")
content = handle.read()
fout.write(content)
logger.info("Done.")
except:
logger.info("Error retriving {}, skip ...".format(args.query))
fout.close()
if __name__ == "__main__":
main()