"""Format from EMBL Nucleotide Sequence Database Release 65, December 2000

Martel expressions describing one EMBL flat-file entry, line type by line
type, assembled into ``record``, ``format_expression`` and ``format`` at the
bottom of the module.
"""
import Martel
from Martel import RecordReader, Time
from Bio import Std

from Bio.expressions.swissprot import sprot38

# NOTE(review): the literal line prefixes below ("ID ", "FT ", " " for
# sequence rows, ...) are reproduced exactly as found in this file.  If
# the file passed through a whitespace-collapsing step, confirm them against
# the fixed-column layout of the EMBL flat file before relying on them.

whitespace = Martel.Spaces()

## ID - identification             (begins each entry; 1 per entry)
#     ID entryname dataclass; molecule; division; sequencelength BP.

# Known division codes, with any three upper-case letters as the fallback.
divisions = Martel.Re(
    "EST|PHG|FUN|GSS|HTC|HTG|HUM|INV|ORG|MAM|VRT|PLN|"
    "PRO|ROD|SYN|STS|UNC|VRL|[A-Z]{3}"
)
# XXX is found in S40706

ID_line = (
    Martel.Str("ID ")
    + Std.dbid(
        Martel.UntilSep("entry_name", " "),
        {"type": "primary", "dbname": "embl"},
    )
    + whitespace
    + Martel.ToSep("dataclass", ";")
    + whitespace
    + Martel.Group(
        "molecule",
        # Each alternative tags the molecule text with the alphabet to use.
        Std.alphabet(
            Martel.Str("DNA", "circular DNA"),
            {"alphabet": "iupac-ambiguous-dna"},
        )
        | Std.alphabet(
            Martel.Str("RNA", "circular RNA"),
            {"alphabet": "iupac-ambiguous-rna"},
        )
        | Std.alphabet(
            Martel.Str("XXX"),
            {"alphabet": "nucleotide"},
        ),
    )
    + Martel.Str("; ")
    + Martel.Group("division", divisions)
    + Martel.Str("; ")
    + Martel.Digits("length")
    + Martel.Str(" BP.")
    + Martel.AnyEol()
)

## AC - accession number           (>=1 per entry)
# One accession is the text up to ';' followed by the ';' itself.
accession = (
    Std.dbid(
        Martel.UntilSep("accession", ";"),
        {"type": "accession", "dbname": "embl"},
    )
    + Martel.Str(";")
)

AC_line = (
    Martel.Str("AC ")
    + accession
    + Martel.Rep(Martel.Str(" ") + accession)
    + Martel.AnyEol()
)
AC_block = Martel.Rep1(AC_line)

## SV - sequence version           (1 per entry)
SV_line = (
    Martel.Str("SV ")
    + Martel.Group(
        "sequence_version",
        Martel.ToSep("accession", ".") + Martel.Digits("version"),
    )
    + Martel.AnyEol()
)

## DT - date                       (2 per entry)
date = Time.make_expression("%(day)-%(Jan)-%(year)")

DT_created_line = (
    Martel.Str("DT ")
    + Martel.Group("date_created", date)
    + Martel.Str(" (Rel. ")
    + Martel.Digits("release_created")
    + Martel.Str(", Created)")
    + Martel.AnyEol()
)
DT_updated_line = (
    Martel.Str("DT ")
    + Martel.Group("date_updated", date)
    + Martel.Str(" (Rel. ")
    + Martel.Digits("release_updated")
    + Martel.Str(", Last updated, Version ")
    + Martel.Digits("version_number")
    + Martel.Str(")")
    + Martel.AnyEol()
)
# The created line must come first, immediately followed by the updated line.
DT_block = DT_created_line + DT_updated_line

## DE - description                (>=1 per entry)
DE_line = (
    Martel.Str("DE ")
    + Std.description(Martel.UntilEol("description"))
    + Martel.AnyEol()
)
DE_block = Std.description_block(
    Martel.Group("description_block", Martel.Rep1(DE_line))
)

## KW - keyword                    (>=1 per entry)
KW_line = Martel.Str("KW ") + Martel.ToEol("keyword_data")
KW_block = Martel.Rep1(KW_line)

## OS - organism species           (>=1 per entry)
OS_block = sprot38.OS_block

## OC - organism classification    (>=1 per entry)
OC_block = sprot38.OC_block

## OG - organelle                  (0 or 1 per entry)
OG_block = sprot38.OG_block

organism = Martel.Group(
    "organism",
    OS_block + OC_block + Martel.Opt(OG_block),
)

## RN - reference number           (>=1 per entry)
## RC - reference comment          (>=0 per entry)
## RP - reference positions        (>=1 per entry)
## RX - reference cross-reference  (>=0 per entry)
## RA - reference author(s)        (>=1 per entry)
## RT - reference title            (>=1 per entry)
## RL - reference location         (>=1 per entry)

# The reference line types are taken from the SWISS-PROT 38 grammar; only
# the RX block grouping is defined here.
RN_line = sprot38.RN
RC_block = sprot38.RC_block
RP_line = sprot38.RP
RX_line = sprot38.RX
RX_block = Martel.Group("RX_block", Martel.Rep1(RX_line))
RA_block = sprot38.RA_block
RT_block = sprot38.RT_block
RL_block = sprot38.RL_block

reference = Martel.Group(
    "reference",
    RN_line
    + Martel.Opt(RC_block)
    + Martel.Opt(RP_line)
    + Martel.Opt(RX_block)
    + RA_block
    + RT_block
    + RL_block,
)

## DR - database cross-reference   (>=0 per entry)
DR_block = sprot38.DR_block

## FH - feature table header       (0 or 2 per entry)
FH_block = (
    Martel.Str("FH Key Location/Qualifiers")
    + Martel.AnyEol()
    + Martel.Str("FH")
    + Martel.AnyEol()
)

## FT - feature table data         (>=0 per entry)
# An earlier, simpler form captured each FT line whole:
##FT_line = Martel.Str("FT ") + \
##          Martel.ToEol("ft_data")
##FT_block = Martel.Rep1(FT_line)

# db_xref="<dbname>:<dbid>" qualifier, split into its database name and id.
fq_dbxref = (
    Std.feature_qualifier_name(Martel.Str("db_xref"))
    + Martel.Str("=")
    + Std.feature_qualifier_description(
        Martel.Str('"')
        + Std.dbxref(
            Std.dbxref_dbname(Martel.UntilSep(None, ":"))
            + Martel.Str(":")
            + Std.dbxref_dbid(Martel.UntilSep(None, '"'))
        )
        + Martel.Str('"')
    )
    + Martel.AnyEol()
)

# Any other name=value qualifier.  The trailing Rep picks up FT lines that
# do not begin a new "/name=" qualifier and records them as further
# description text for the same qualifier.
fq_generic = (
    Martel.Assert(Martel.Word() + Martel.Str("="))
    + Std.feature_qualifier_name(Martel.Word())
    + Martel.Str("=")
    + Std.feature_qualifier_description(Martel.UntilEol())
    + Martel.AnyEol()
    + Martel.Rep(
        Martel.Str("FT ")
        + (
            Martel.AssertNot(Martel.Str("/"))
            | Martel.AssertNot(Martel.Re(r"/\w+="))
        )
        + Std.feature_qualifier_description(Martel.UntilEol())
        + Martel.AnyEol()
    )
)

# The db_xref form is tried before the generic form.
feature_qualifier = Std.feature_qualifier(
    Martel.Str("FT /") + (fq_dbxref | fq_generic)
)

# One feature: the key and location line, location lines that follow (FT
# lines not starting with "/"), then any number of qualifiers.
feature = Std.feature(
    Martel.Str("FT ")
    + Std.feature_name(Martel.UntilSep(sep=" "))
    + whitespace
    + Std.feature_location(Martel.UntilEol())
    + Martel.AnyEol()
    + Martel.Rep(
        Martel.Str("FT ")
        + Martel.AssertNot(Martel.Str("/"))
        + Std.feature_location(Martel.UntilEol())
        + Martel.AnyEol()
    )
    + Martel.Rep(feature_qualifier)
)

FT_block = Std.feature_block(
    Martel.Rep(feature),
    {"location-style": "genbank"},
)

## CC - comments or notes          (>=0 per entry)
CC_line = Martel.Str("CC ") + Martel.ToEol("comment")
CC_block = Martel.Rep1(CC_line)

## XX - spacer line                (many per entry)
XX = Martel.Str("XX") + Martel.AnyEol()

## SQ - sequence header            (1 per entry)
SQ_line = (
    Martel.Str("SQ Sequence ")
    + Martel.Digits("num_BP")
    + Martel.Str(" BP; ")
    + Martel.Digits("num_A")
    + Martel.Str(" A; ")
    + Martel.Digits("num_C")
    + Martel.Str(" C; ")
    + Martel.Digits("num_G")
    + Martel.Str(" G; ")
    + Martel.Digits("num_T")
    + Martel.Str(" T; ")
    + Martel.Digits("num_other")
    + Martel.Str(" other;")
    + Martel.AnyEol()
)

## bb - (blanks) sequence data     (>=1 per entry)
# Each row: leading blank text, exactly 65 characters captured as sequence,
# whitespace, then the end-position number.
SQ_data = (
    Martel.Str(" ")
    + Std.sequence(Martel.Re(".{65}"))
    + whitespace
    + Martel.Digits("end_position")
    + Martel.AnyEol()
)
SQ_block = Std.sequence_block(SQ_line + Martel.Rep1(SQ_data))

## // - termination line           (ends each entry; 1 per entry)
end = Martel.Str("//") + Martel.AnyEol()

# A complete entry.  An XX spacer line is optional after each section up to
# the comments; the FH header is required and is followed directly by the
# feature table.
record = Martel.Group(
    "record",
    ID_line
    + Martel.Opt(XX)
    + AC_block
    + Martel.Opt(XX)
    + SV_line
    + Martel.Opt(XX)
    + DT_block
    + Martel.Opt(XX)
    + DE_block
    + Martel.Opt(XX)
    + KW_block
    + Martel.Opt(XX)
    + Martel.Rep1(organism + Martel.Opt(XX))
    + Martel.Rep(reference + Martel.Opt(XX))
    + Martel.Opt(DR_block + Martel.Opt(XX))
    + Martel.Rep(CC_block + Martel.Opt(XX))
    + FH_block
    + FT_block
    + Martel.Opt(XX)
    + SQ_block
    + end,
    {"format": "embl/65"},
)

# Whole-file expression: one or more records.
format_expression = Martel.Group(
    "dataset",
    Martel.Rep1(record),
    {"format": "embl/65"},
)

# Record-at-a-time parser: the reader is RecordReader.EndsWith with "//\n"
# as its argument, and each record read is parsed with ``record``.
format = Martel.ParseRecords(
    "dataset",
    {"format": "embl/65"},
    record,
    RecordReader.EndsWith,
    ("//\n",),
)