%{
#include "threestatemodel.h"
#include "pfamhmmer1db.h"

/* Kinds of back-end a ThreeStateDB can wrap; the value is held in ThreeStateDB.dbtype */
typedef enum {
  TSMDB_UNKNOWN,
  TSMDB_SINGLE,
  TSMDB_HMMER1PFAM,
  TSMDB_PROTEIN,
  TSMDB_GENERIC
} TSMDB_Type;

%}

friend ThreeStateDB
friend PfamHmmer1DB

struct ThreeStateDB
int dbtype !def="TSMDB_UNKNOWN" // one of the TSMDB_ values; the functions in this file switch on it
char * filename
int type !def="TSM_default" !hidden // start/end policy mode, set by set_search_type_ThreeStateDB
FILE * current_file !link !def="NULL"
RandomModel * rm // NB, this is hard-linked...
long byte_position // this is the file position for the current model
ThreeStateModel * single // for single db cases
PfamHmmer1DB * phdb // for Pfam Hmmer1 style databases.
SequenceDB * sdb // for protein databases
CompMat * comp // for protein databases
int gap // for protein databases
int ext // for protein databases
Sequence * seq_cache // first sequence, stored by open_ThreeStateDB and consumed by read_TSM_ThreeStateDB
ThreeStateModel * (*reload_generic)(ThreeStateDB * tdb,int * return_status) !func // for generic parsing applications; called by read_TSM_ThreeStateDB
boolean (*open_generic)(ThreeStateDB * tdb) !func // called by open_ThreeStateDB
boolean (*close_generic)(ThreeStateDB * tdb) !func
boolean (*dataentry_add)(ThreeStateDB * tdb,DataEntry * en) !func // called by dataentry_add_ThreeStateDB
boolean (*open_index_generic)(ThreeStateDB *tdb) !func // called by open_for_indexing_ThreeStateDB
ThreeStateModel * (*index_generic)(ThreeStateDB *tdb,DataEntry *de) !func // called by indexed_ThreeStateModel_ThreeStateDB
boolean (*close_index_generic)(ThreeStateDB *tdb) !func // called by close_for_indexing_ThreeStateDB
void * data !link // whatever else the damn system wants to carry around with it!
%info
ThreeStateDB is the object that represents
a database of profile-HMMs.

The object holds a variety of fields, only some
of which are occupied depending on the type.

Realistically we need a more abstract class
idea, which is implemented here anyway via
the generic stuff, in hacky C-style pointers
to function plus a void pointer. This object
therefore houses a switch system around the
different types including the generic system...
but as the generic function stuff was bolted on
later, some things are handled with explicit
datastructures. It is quite messy ;). Apologies.
To be cleaned up.

The generic stuff was principly added in to allow a decoupling of this module from the HMMer2.o interface code which is held in wise2xhmmer.dy The old static datastructure code can be made via protein sequences which are then converted or a Pfam 2.0 style directory + HMMs file. %% api ThreeStateDB object ThreeStateDB des free_ThreeStateDB func indexed_ThreeStateModel_ThreeStateDB func new_proteindb_ThreeStateDB func new_PfamHmmer1DB_ThreeStateDB func new_single_ThreeStateDB endobject endapi %{ #include "threestatedb.h" %func opens database ready for calls to /indexed_ThreeStateModel_ThreeStateDB %arg mdb database to open for index calls %% boolean open_for_indexing_ThreeStateDB(ThreeStateDB * mdb) { switch(mdb->dbtype) { case TSMDB_SINGLE : return TRUE; case TSMDB_HMMER1PFAM : return TRUE; /* name is the index! */ case TSMDB_PROTEIN : return TRUE; /* sequence db's don't need opening for indexing */ case TSMDB_GENERIC : return ((*mdb->open_index_generic)(mdb)); default : warn("Unknown threestatemodel db type."); return FALSE; } } %func closes an indexable database %% boolean close_for_indexing_ThreeStateDB(ThreeStateDB * mdb) { switch(mdb->dbtype) { case TSMDB_GENERIC : return ((*mdb->close_index_generic)(mdb)); default : return TRUE; } } %func Set the search type of this threestatedb... %arg type to set can be any of the modes found in threestatemodel %% boolean set_search_type_ThreeStateDB(ThreeStateDB * tdb,char * type) { int ret; ret = threestatemodel_mode_from_string(type); if( ret == TSM_unknown ) { /* warning already issued */ return FALSE; } tdb->type = ret; return TRUE; } %func Retrieves a model from a database which has been opened for indexing by /open_for_indexing_ThreeStateDB The index information comes from the dataentry which should have been from a search of the ThreeStateDB. 
%simple indexed_model
%arg
mdb database where this is indexed
en  dataentry to pull the model from
%%
ThreeStateModel * indexed_ThreeStateModel_ThreeStateDB(ThreeStateDB * mdb,DataEntry * en)
{
  Sequence * seq;
  Protein * pro;
  ThreeStateModel * tsm;

  switch(mdb->dbtype) {
  case TSMDB_SINGLE :
    /* only one model: hand out another reference to it, en is not consulted */
    return hard_link_ThreeStateModel(mdb->single);

  case TSMDB_HMMER1PFAM :
    tsm = ThreeStateModel_from_name_PfamHmmer1DB(mdb->phdb,en->name);
    /* a failed lookup must not reach the policy setter; same guard as the generic case */
    if( tsm == NULL ) {
      return NULL;
    }
    set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);
    return tsm;

  case TSMDB_PROTEIN :
    seq = get_Sequence_from_SequenceDB(mdb->sdb,en);
    if( seq == NULL ) {
      warn("could not retrieve %s as a sequence from database",en->name);
      return NULL;
    }

    pro = Protein_from_Sequence(seq);
    if( pro == NULL ) {
      warn("Could not convert sequence to a protein. Exiting!");
      return NULL;
    }

    /* convert protein to threestatemodel */
    tsm = ThreeStateModel_from_half_bit_Sequence(pro,mdb->comp,mdb->rm,mdb->gap,mdb->ext);

    /* the protein is released whether or not the conversion worked */
    free_Protein(pro);

    if( tsm == NULL ) {
      warn("Could not convert protein to threestatemode. Exiting!");
      return NULL;
    }

    /* DB status already set by seqdb */
    return tsm;

  case TSMDB_GENERIC :
    tsm = ((*mdb->index_generic)(mdb,en));
    if( tsm == NULL ) {
      return NULL;
    }
    set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);
    return tsm;

  default :
    warn("Unknown threestatedb type");
    return NULL;
  }
}

%func
makes a new ThreeStateDB from a
sequencedb (better be protein!)
%arg sdb r sequence database to use comp r comparison matrix to use gap r gap open penalty ext r gap extensions penalty %% ThreeStateDB * new_proteindb_ThreeStateDB(SequenceDB * sdb,CompMat * comp,int gap,int ext) { ThreeStateDB * out; out = ThreeStateDB_alloc(); out->sdb = hard_link_SequenceDB(sdb); out->comp = hard_link_CompMat(comp); out->gap = gap; out->ext = ext; out->rm = default_RandomModel(); out->dbtype = TSMDB_PROTEIN; return out; } %func Making a new ThreeStateDB from a single model %arg tsm r a single ThreeStateModel rm r random model to be used in comparisons.. %% ThreeStateDB * new_single_ThreeStateDB(ThreeStateModel * tsm,RandomModel * rm) { ThreeStateDB * out; out = ThreeStateDB_alloc(); out->single = hard_link_ThreeStateModel(tsm); if( tsm->rm == NULL ) out->rm = hard_link_RandomModel(rm); else out->rm = hard_link_RandomModel(tsm->rm); out->dbtype = TSMDB_SINGLE; return out; } %func Makes a new PfamHmmer1DB from a filename indicating the directory %% ThreeStateDB * new_PfamHmmer1DB_ThreeStateDB(char * dirname) { ThreeStateDB * out; out = ThreeStateDB_alloc(); out->phdb = PfamHmmer1DB_from_dirname(dirname); out->dbtype = TSMDB_HMMER1PFAM; return out; } %func This function adds the internal entry information (eg indexing point) into the dataentry %% boolean dataentry_add_ThreeStateDB(DataEntry * de,ThreeStateScore * tss,ThreeStateDB * mdb) { switch(mdb->dbtype) { case TSMDB_SINGLE : de->name = stringalloc(mdb->single->name); return TRUE; case TSMDB_HMMER1PFAM : if( tss == NULL ) { } else { de->name = stringalloc(tss->name); } return TRUE; case TSMDB_PROTEIN : add_SequenceDB_info_DataEntry(mdb->sdb,de); return TRUE; case TSMDB_GENERIC : if( (*mdb->dataentry_add)(mdb,de) == FALSE ) { warn("Could not add dataentry info to the entry %s",tss->name); return FALSE; } else { return TRUE; } default : warn("Unknown threestatedb type"); return FALSE; } return TRUE; } %func Open function for ThreeStateDB. 
An internal for this file but also used by, for example,
GeneWiseDB that wants to get at the underlying models,
not the log-odds.

Returns TRUE when the database is ready for
/read_TSM_ThreeStateDB calls, FALSE otherwise.
%%
boolean open_ThreeStateDB(ThreeStateDB * mdb)
{
  int ret;

  switch( mdb->dbtype ) {
  case TSMDB_SINGLE :
    return TRUE; /* should be fine! */

  case TSMDB_HMMER1PFAM :
    if( mdb->phdb == NULL ) {
      warn("No hmmer1 db to open for threestatedb!");
      return FALSE;
    }
    /* restart the scan at the first entry */
    mdb->phdb->cur = 0;
    return TRUE;

  case TSMDB_PROTEIN :
    if( mdb->sdb == NULL ) {
      warn("Attempting to open a protein tsm with no sequence db!");
      return FALSE;
    }

    /* the first sequence is parked in seq_cache for the first read_TSM call */
    mdb->seq_cache = init_SequenceDB(mdb->sdb,&ret);

    if( ret == DB_RETURN_ERROR ) {
      return FALSE;
    }
    if( ret == DB_RETURN_END ) {
      warn("Due to some bad coding, can't cope with single protein databases in tsmdbs. oooops!");
    }
    return TRUE;

  case TSMDB_GENERIC :
    return ((*mdb->open_generic)(mdb));

  default :
    warn("Got an unrecognisable tsm db type in opening tsm %d",mdb->dbtype);
    return FALSE;
  }
}

%func
Reads a threestatemodel out from the database.
People will probably want the ThreeStateScore
*not* the model, but some systems will want
the model.

A NULL return comes with return_status set by
the underlying database, or with DB_RETURN_ERROR
when the failure is detected in this function.
%%
ThreeStateModel * read_TSM_ThreeStateDB(ThreeStateDB * mdb,int * return_status)
{
  ThreeStateModel * tsm;
  Protein * pro;
  Sequence * seq;

  switch( mdb->dbtype ) {
  case TSMDB_SINGLE :
    /* one model only, so the first read is also the last */
    *return_status = DB_RETURN_END;
    if( mdb->single->rm == NULL ) {
      warn("Threestate model without an internal random model!");
      mdb->single->rm = hard_link_RandomModel(mdb->rm);
    }
    return hard_link_ThreeStateModel(mdb->single);

  case TSMDB_HMMER1PFAM :
    tsm = read_next_TSM_PfamHmmer1DB(mdb->phdb,return_status);
    /* nothing read: do not hand NULL to the policy setter (same guard as generic) */
    if( tsm == NULL ) {
      return NULL;
    }
    set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);
    return tsm;

  case TSMDB_PROTEIN :
    if( mdb->seq_cache != NULL ) {
      /*
       * just after an open: use the cached sequence and flush the cache.
       * An extra reference is taken for the protein before the cache's
       * own reference is released.
       */
      pro = Protein_from_Sequence(hard_link_Sequence(mdb->seq_cache));
      mdb->seq_cache = free_Sequence(mdb->seq_cache);
      *return_status = DB_RETURN_OK;
    } else {
      /* reload a sequence from a database */
      seq = reload_SequenceDB(NULL,mdb->sdb,return_status);

      /* exit now if error, releasing anything that was still handed back */
      if( *return_status == DB_RETURN_ERROR ) {
	if( seq != NULL ) {
	  free_Sequence(seq);
	}
	return NULL;
      }

      /* if we get NULL... for the moment, silent flag end */
      if( seq == NULL ) {
	*return_status = DB_RETURN_END;
	return NULL;
      }

      pro = Protein_from_Sequence(seq);
    }

    if( pro == NULL ) {
      warn("Could not convert sequence to a protein. Exiting!");
      *return_status = DB_RETURN_ERROR;
      return NULL;
    }

    /* convert protein to threestatemodel */
    tsm = ThreeStateModel_from_half_bit_Sequence(pro,mdb->comp,mdb->rm,mdb->gap,mdb->ext);

    /*
     * the protein is released on success as well as on failure, as
     * indexed_ThreeStateModel_ThreeStateDB does; it used to leak on success
     */
    free_Protein(pro);

    if( tsm == NULL ) {
      warn("Could not convert protein to threestatemode. Exiting!");
      *return_status = DB_RETURN_ERROR;
      return NULL;
    }

    /* DB status already set by seqdb */
    return tsm;

  case TSMDB_GENERIC :
    tsm = ((*mdb->reload_generic)(mdb,return_status));
    if( tsm == NULL ) {
      return NULL; /* means end of database */
    }
    set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);
    return tsm;

  default :
    warn("Got an unrecognisable tsm db type in read-load");
    /* callers inspect the status, so never leave it unset */
    *return_status = DB_RETURN_ERROR;
    return NULL;
  }
}

%func
Init function for ThreeStateDB

Is going to open file, read first model, complain if
NULL, and convert to a score system.
%arg
mdb rw Model database
return_status w return from database.h system
%%
ThreeStateScore * init_ThreeStateDB(ThreeStateDB * mdb,int * return_status)
{
  ThreeStateModel * tsm;
  ThreeStateScore * tss;

  if( open_ThreeStateDB(mdb) == FALSE) {
    warn("Could not open ThreeStateDB, hence could not init it!");
    *return_status = DB_RETURN_ERROR;
    return NULL;
  }

  tsm = read_TSM_ThreeStateDB(mdb,return_status);

  if( *return_status == DB_RETURN_ERROR)
    return NULL;

  /*
   * An empty database gives no first model. Complain and stop rather than
   * pass NULL on; an end-of-database status is kept, anything else becomes
   * an error so the caller never sees NULL together with an OK status.
   */
  if( tsm == NULL ) {
    warn("Could not read a first model from ThreeStateDB, hence could not init it!");
    if( *return_status != DB_RETURN_END ) {
      *return_status = DB_RETURN_ERROR;
    }
    return NULL;
  }

  set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);

  fold_RandomModel_into_ThreeStateModel(tsm,mdb->rm);
  tss = ThreeStateScore_from_ThreeStateModel(tsm);

  /* only the score system is handed back */
  free_ThreeStateModel(tsm);

  /* a single-model read reports END; init still reports OK, the following reload gives END */
  *return_status = DB_RETURN_OK;
  return tss;
}

%func
reloads the ThreeStateDB.

Frees the previous score system (could recycle memory).
Reads database. Reports END if it gets NULL from
/read_TSM_ThreeStateDB
%arg
prev the previous score system
mdb model database system
return_status w return from database.h system
%%
ThreeStateScore * reload_ThreeStateDB(ThreeStateScore * prev,ThreeStateDB * mdb,int * return_status)
{
  ThreeStateModel * tsm;
  ThreeStateScore * tss;

  if( prev != NULL )
    free_ThreeStateScore(prev);

  /* a single-model database was exhausted by init */
  if( mdb->dbtype == TSMDB_SINGLE ) {
    *return_status = DB_RETURN_END;
    return NULL;
  }

  tsm = read_TSM_ThreeStateDB(mdb,return_status);

  if( *return_status != DB_RETURN_OK) {
    /* a model delivered together with a non-OK status is not used: release it */
    if( tsm != NULL ) {
      free_ThreeStateModel(tsm);
    }
    return NULL;
  }

  /* no model despite an OK status: treat as end of database */
  if( tsm == NULL ) {
    *return_status = DB_RETURN_END;
    return NULL;
  }

  set_startend_policy_ThreeStateModel(tsm,mdb->type,30,0.2);
  fold_RandomModel_into_ThreeStateModel(tsm,mdb->rm);
  tss = ThreeStateScore_from_ThreeStateModel(tsm);
  free_ThreeStateModel(tsm);

  *return_status = DB_RETURN_OK;
  return tss;
}

%func
closes ThreeStateDB

Frees the previous score system and releases
whatever /open_ThreeStateDB set up for the
database type.
%arg
prev the last ThreeStateScore to be freed
mdb Model database
%%
boolean close_ThreeStateDB(ThreeStateScore * prev,ThreeStateDB * mdb)
{
  if( prev != NULL )
    free_ThreeStateScore(prev);

  if( mdb == NULL ) {
    warn("Trying to close a NULL threestatedb - considering this an error!");
    return FALSE;
  }

  switch(mdb->dbtype) {
  case TSMDB_SINGLE :
    return TRUE;

  case TSMDB_HMMER1PFAM :
    /*
     * open_ThreeStateDB does not open current_file for this type, so it is
     * normally still NULL; fclose(NULL) is undefined, hence the guard.
     */
    if( mdb->current_file != NULL ) {
      fclose(mdb->current_file);
      mdb->current_file = NULL;
    }
    return TRUE;

  case TSMDB_PROTEIN :
    close_SequenceDB(NULL,mdb->sdb);
    return TRUE;

  case TSMDB_GENERIC :
    /* counterpart of open_generic used by open_ThreeStateDB, not the indexing close */
    return ((*mdb->close_generic)(mdb));

  default :
    warn("Unknown threestatemodel db type.");
    return FALSE;
  }
}

%}