/******************************************************************************
 * $Id: mlex.c,v 1.7 2005/10/24 22:43:58 gareuselesinge Exp $
 * This file is part of liberopops (http://liberopops.sf.net)                 *
 * This file is distributed under the terms of GNU GPL license.               *
 ******************************************************************************/

/******************************************************************************
 * File description:
 *	Markup Language EXpressions
 * Notes:
 *	This version supports optional tags/strings and the backtrack
 *	tends to be a bit slower.
 * Authors:
 *	Enrico Tassi
 ******************************************************************************/
/* NOTE(review): the next two include directives carry no header name in this
 * copy of the file; restore the names from the upstream source. */
#include
#include

#include "regularexp.h"
#include "list.h"
#include "mlex.h"
#include "log.h"
#define LOG_ZONE "MLEX"

//#define DEBUG_MLEX 1

/* every file-local function below is declared with HIDDEN, i.e. static */
#define HIDDEN static

/*** * * * * * * * * * * * * * * * * * * * * * * * * */
/*** local types/macro * * * * * * * * * * * * * * * * * * */
/*** * * * * * * * * * * * * * * * * * * * * * * * * */

/* token kinds; presumably the values stored in token_t.tag - TODO confirm */
#define TOK_STR 0
#define TOK_TAG 1
#define TOK_OPT_TAG 2
#define TOK_OPT_STR 4

/* result codes of a match attempt */
#define MATCHES 1
#define NOT_MATCHES 0

//! used to represent answers
struct answer_t {
	list_t* start;
	int len;
	list_t* dust_lengths;
	};

//! used for tokenization
struct token_t {
	int start,stop;
	short tag;
	int dustlen;
	};

//! used for backtracking
struct back_t {
	list_t* dust_lengths;
	list_t* position_stream;
	list_t* position_expr;
	int len;
	};

/*** * * * * * * * * * * * * * * * * * * * * * * * * */
/*** prototypes divided by section * * * * * * * * * * * * * * */
/*** * * * * * * * * * * * * * * * * * * * * * * * * */

/*** tokenization ***/
/* NOTE(review): the definition of regexec_my names its third parameter
 * "mode", not "bag". */
HIDDEN void regexec_my(char* s,regmatch_t *p, int *bag);
HIDDEN void regexec_myext(char* s,regmatch_t *p);
HIDDEN list_t *tokenize_html(char* t);
HIDDEN list_t *tokenize_exp(char* t);

/*** matching ***/
HIDDEN regmatch_t token_match(struct token_t* t,char* s,char* exp);
HIDDEN int token_match_token(struct token_t* t,char* s,struct token_t *t1,
	char* s1);
HIDDEN unsigned int mlmatch_find(list_t *data,list_t* pattern,char* str,
	char* exp,list_t*saved_pattern,list_t*stack,int len);
HIDDEN int is_a_keep(void* x,char* str);

/*** result refinement ***/
HIDDEN list_t* epurate(list_t* data,int len,list_t* dl,char* txt,list_t* get,
	char* str);
HIDDEN list_t* mlmatch_epurate(list_t* ans,char* txt, list_t* get,char* str);

/*** result handling ***/
HIDDEN struct chunk_t* mlmatch_get_x(list_t* l,int pos);
HIDDEN list_t* mlmatch_get_y(list_t* l,int pos);
HIDDEN char *mlmatch_get(struct chunk_t*c,char* str);

/*** aux ***/
HIDDEN void mlmatch_print_results_aux(list_t*res,char* str);
HIDDEN list_t* mlmatch_aux(list_t *data,list_t* pattern,char* str,char* exp,
	int min_len);

/*** helpers ***/
HIDDEN void restore_dusts(list_t* orig, list_t* copy);
HIDDEN list_t * copy_dusts(list_t *orig);
HIDDEN void clean_stack(list_t* stack);
HIDDEN void reset_dustlen(list_t* pattern);
HIDDEN int exp_min_len(list_t* l);

/*** free ***/
HIDDEN void free_sublist(void* l);
HIDDEN void free_answer(void *f);

/*** new ***/
HIDDEN void * new_int(int i);
HIDDEN list_t* new_dust_lengths(list_t* exp);
HIDDEN struct chunk_t * new_chunk(int start,int stop);
HIDDEN struct answer_t* new_answer(list_t* start,int len,list_t* exp);
HIDDEN struct token_t *new_token(int start,int stop,short tags);

/***
print ***/ HIDDEN void print_token(struct token_t *c, char *s); HIDDEN void print_toklist(list_t*l, char *s, int i); HIDDEN void print_toklistn(list_t*l, char *s,int n); HIDDEN void print_chunk(struct chunk_t*c, char* str); #ifdef DEBUG_MLEX HIDDEN void print_int(void*x); #endif HIDDEN void print_anslist(list_t *a,char* str); /*** * * * * * * * * * * * * * * * * * * * * * * * * */ /*** the code * * * * * * * * * * * * * * * * * * * * * */ /*** * * * * * * * * * * * * * * * * * * * * * * * * */ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // section: tokenization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ /*** * * * * * * * * * * * * * * * * * * * * * * * * ** * to be used wit an HTML string, finds the fist token * a token is a tag, or a comment or a * */ #define MODE_SCRIPT 1 #define MODE_PLAIN 0 HIDDEN __inline__ int scriptmatch(char* s){ int pos = 0; while (s[pos] == ' ') pos++; if (s[pos] == '/') pos++; if (s[pos] == '\0' || (s[pos] != 's' && s[pos] != 'S')) return MODE_PLAIN; pos++; if (s[pos] == '\0' || (s[pos] != 'c' && s[pos] != 'C')) return MODE_PLAIN; pos++; if (s[pos] == '\0' || (s[pos] != 'r' && s[pos] != 'R')) return MODE_PLAIN; pos++; if (s[pos] == '\0' || (s[pos] != 'i' && s[pos] != 'I')) return MODE_PLAIN; pos++; if (s[pos] == '\0' || (s[pos] != 'p' && s[pos] != 'P')) return MODE_PLAIN; pos++; if (s[pos] == '\0' || (s[pos] != 't' && s[pos] != 'T')) return MODE_PLAIN; return MODE_SCRIPT; } HIDDEN void regexec_my(char* s,regmatch_t *p, int* mode) { int pos; int dust; int sm = MODE_PLAIN; p->begin = -1; p->end = -1; for(pos = 0 ; !( s[pos] == '\0' || (s[pos] == '<' && (((sm=scriptmatch(&s[pos+1])) == MODE_SCRIPT) || *mode == MODE_PLAIN ))) ; pos++); if(s[pos] == '\0') return; #ifdef DEBUG_MLEX printf("STOP: s[pos] == %c, *mode = %d, next are %c%c%c%c%c%c sm = %d\n", 
s[pos],*mode,s[pos+1],s[pos+2],s[pos+3],s[pos+4],s[pos+5],s[pos+6],sm); #endif p->begin = pos; dust=0; if(*mode != MODE_SCRIPT && !strncmp(&s[pos],"