# Read a FASTA description

import operator
from Martel import *
from Bio import Std

### Parse dbxrefs given the NCBI|descr|line as explained
### in ftp://ncbi.nlm.nih.gov/blast/db/README and augmented
### by experience

def make_2id(s, dbname, primary_name, secondary_name):
    assert secondary_name is not None
    if primary_name is None:
        return Str(s + "||") + \
               Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
                                                      "type": secondary_name})

    return Str(s + "|") + \
           Std.dbxref_dbid(UntilSep(sep = "|"), {"dbname": dbname,
                                                 "type": primary_name}) + \
           Str("|") + \
           Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
                                                  "type": secondary_name})

def make_1id(s, dbname, name):
    return Str(s + "|") + \
           Std.dbxref_dbid(UntilSep(sep = "| "), {"dbname": dbname,
                                                  "type": name})
                                    
ids = []
# gene identifier              gi|id  # This isn't in the README
ids.append(make_1id("gi", "x-gi", "primary"))

# GenBank                      gb|accession|locus
# gb|U37104|APU37104
ids.append(make_2id("gb", "gb", "primary", "secondary"))

# EMBL Data Library            emb|accession|locus
# emb|F19596|HSPD04201
ids.append(make_2id("emb", "embl", "primary", "secondary"))

# DDBJ, DNA Database of Japan  dbj|accession|locus
ids.append(make_2id("dbj", "ddbj", "primary", "secondary"))

# NBRF PIR                     pir||entry
ids.append(make_2id("pir", "pir", None, "primary"))

# Protein Research Foundation  prf||name
ids.append(make_2id("prf", "x-prf", None, "primary"))

# SWISS-PROT                   sp|accession|entry name
ids.append(make_2id("sp", "sp", "primary", "secondary"))

# Brookhaven Protein Data Bank pdb|entry|chain
ids.append(make_2id("pdb", "x-pdb", "primary", "secondary"))  # XXX not correct

# Patents                      pat|country|number 
ids.append(make_2id("pat", "x-pat", "primary", "secondary"))  # XXX not correct

# GenInfo Backbone Id          bbs|number 
ids.append(make_1id("bbs", "x-bbs", "primary"))

# General database identifier  gnl|database|identifier
gnl_id = Str("gnl|") + \
         Std.dbxref_dbname(UntilSep(sep = "| ")) + \
         Str("|") + \
         Std.dbxref_dbid(UntilSep(sep = "| "))
ids.append(gnl_id)

# NCBI Reference Sequence      ref|accession|locus
ids.append(make_2id("ref", "x-ref", "primary", "secondary"))

# Local Sequence identifier    lcl|identifier
ids.append(make_1id("lcl", "local", "primary"))

# "|" them all together
ncbi_word = Std.dbxref(reduce(operator.or_, ids))

#ncbi_term = Assert(Re("[^ \R]+\|")) + \
ncbi_term =  ncbi_word + Rep(Str("|") + ncbi_word)

# Anything else
generic_term = Std.dbxref(
                 Std.dbxref_dbid(UntilSep(sep = " "), {"dbname": "local"})
               )
id_term = ncbi_term | generic_term
###########################################################

comment_lines = Rep(Str("#") + ToEol())
title = Str(">") + Std.description_line(id_term + UntilEol()) + AnyEol()
seqline = AssertNot(Str(">")) + Std.sequence(UntilEol()) + AnyEol()
# can get a sequence line without an Eol at the end of a file
seqline_nonewline = AssertNot(Str(">")) + Std.sequence(Word())

sequence = Std.sequence_block(Rep(seqline | seqline_nonewline))

record = Std.record(comment_lines + title + sequence + Rep(AnyEol()))

# define a format which reads records, but allows #-style comments in 
# the FASTA file
format = HeaderFooter("dataset", {"format": "fasta"},
                      comment_lines, RecordReader.Until, (">",),
                      record, RecordReader.StartsWith, (">",),
                      comment_lines, RecordReader.Everything, ()
                     )