/* Generated by re2c 0.5 on Sun Mar 7 13:35:34 2004 */ #line 1 "/home/wez/src/php/pecl/mailparse/php_mailparse_rfc822.re" /* +----------------------------------------------------------------------+ | PHP Version 4 | +----------------------------------------------------------------------+ | Copyright (c) 1997-2004 The PHP Group | +----------------------------------------------------------------------+ | This source file is subject to version 2.02 of the PHP license, | | that is bundled with this package in the file LICENSE, and is | | available at through the world-wide-web at | | http://www.php.net/license/2_02.txt. | | If you did not receive a copy of the PHP license and are unable to | | obtain it through the world-wide-web, please send a note to | | license@php.net so we can mail you a copy immediately. | +----------------------------------------------------------------------+ | Author: Wez Furlong | +----------------------------------------------------------------------+ */ /* $Id: php_mailparse_rfc822.c,v 1.14 2004/12/28 18:55:20 wez Exp $ */ #include "php.h" #include "php_mailparse.h" #include "php_mailparse_rfc822.h" #include "ext/standard/php_string.h" #include "ext/standard/php_smart_str.h" #line 39 #line 48 #define YYFILL(n) if (YYCURSOR == YYLIMIT) goto stop #define YYCTYPE unsigned char #define YYCURSOR p #define YYLIMIT q #define YYMARKER r #define DEBUG_RFC822_SCANNER 0 #if DEBUG_RFC822_SCANNER # define DBG_STATE(lbl) printf(lbl " %d:%c %d:%c\n", *YYCURSOR, *YYCURSOR, *start, *start) #else # define DBG_STATE(lbl) #endif #define ADD_ATOM_TOKEN() do { if (tokens) { tokens->token = *start; tokens->value = start; tokens->valuelen = 1; tokens++; } ++*ntokens; } while (0) #define REPORT_ERR(msg) do { if (report_errors) zend_error(E_WARNING, "input is not rfc822 compliant: %s", msg); } while(0) /* Tokenize a header. tokens may be NULL, in which case the number of tokens are counted, allowing the caller to allocate enough room */ static void tokenize(const char *header, php_rfc822_token_t *tokens, int *ntokens, int report_errors TSRMLS_DC) { register const char *p, *q, *start; int in_bracket = 0; /* NB: parser assumes that the header has two bytes of NUL terminator */ YYCURSOR = header; YYLIMIT = YYCURSOR + strlen(YYCURSOR) + 1; *ntokens = 0; state_ground: start = YYCURSOR; #if DEBUG_RFC822_SCANNER printf("ground: start=%p limit=%p cursor=%p: [%d] %s\n", start, YYLIMIT, YYCURSOR, *YYCURSOR, YYCURSOR); #endif { YYCTYPE yych; unsigned int yyaccept; static unsigned char yybm[] = { 0, 192, 192, 192, 192, 192, 192, 192, 192, 96, 96, 192, 192, 96, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 96, 64, 0, 192, 192, 64, 192, 192, 64, 64, 192, 192, 64, 192, 64, 64, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 64, 64, 64, 64, 64, 64, 64, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 64, 192, 64, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, 192, }; goto yy0; yy1: ++YYCURSOR; yy0: if((YYLIMIT - YYCURSOR) < 2) YYFILL(2); yych = *YYCURSOR; if(yybm[0+yych] & 32) goto yy4; if(yych <= '-'){ if(yych <= '%'){ if(yych <= '!'){ if(yych <= '\000') goto yy2; if(yych <= ' ') goto yy21; goto yy19; } else { if(yych <= '"') goto yy12; if(yych <= '$') goto yy21; goto yy19; } } else { if(yych <= ')'){ if(yych <= '\'') goto yy21; if(yych <= '(') goto yy10; goto yy7; } else { if(yych == ',') goto yy19; goto yy21; } } } else { if(yych <= '>'){ if(yych <= ';'){ if(yych <= '/') goto yy19; if(yych <= '9') goto yy21; goto yy19; } else { if(yych <= '<') goto yy15; if(yych <= '=') goto yy19; goto yy17; } } else { if(yych <= '['){ if(yych <= '@') goto yy19; if(yych <= 'Z') goto yy21; goto yy19; } else { if(yych <= '\\') goto yy9; if(yych <= ']') goto yy19; goto yy21; } } } yy2: yych = *++YYCURSOR; yy3: #line 88 { goto stop; } yy4: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; yy5: if(yybm[0+yych] & 32) goto yy4; yy6: #line 89 { DBG_STATE("SPACE"); goto state_ground; } yy7: yych = *++YYCURSOR; yy8: #line 90 { REPORT_ERR("token not valid in ground state"); goto state_ground; } yy9: yych = *++YYCURSOR; if(yybm[0+yych] & 128) goto yy21; goto yy8; yy10: yych = *++YYCURSOR; yy11: #line 91 { DBG_STATE("START COMMENT"); if (tokens) { tokens->token = '('; tokens->value = start; tokens->valuelen = 0; } goto state_comment; } yy12: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; yy13: if(yybm[0+yych] & 64) goto yy12; if(yych >= '\001') goto yy26; yy14:yy15: yych = *++YYCURSOR; if(yych == '>') goto yy24; yy16: #line 123 { DBG_STATE("LANGLE"); if (in_bracket) { REPORT_ERR("already in < bracket"); goto state_ground; } in_bracket = 1; ADD_ATOM_TOKEN(); goto state_ground; } yy17: yych = *++YYCURSOR; yy18: #line 132 { DBG_STATE("RANGLE"); if (!in_bracket) { REPORT_ERR("not in < bracket"); goto state_ground; } in_bracket = 0; ADD_ATOM_TOKEN(); goto state_ground; } yy19: yych = *++YYCURSOR; yy20: #line 141 { DBG_STATE("ATOM"); ADD_ATOM_TOKEN(); goto state_ground; } yy21: ++YYCURSOR; if(YYLIMIT == YYCURSOR) YYFILL(1); yych = *YYCURSOR; yy22: if(yybm[0+yych] & 128) goto yy21; yy23: #line 142 { DBG_STATE("ANY"); if (tokens) { tokens->token = 0; tokens->valuelen = YYCURSOR - start; tokens->value = start; tokens++; } ++*ntokens; goto state_ground; } yy24: yych = *++YYCURSOR; yy25: #line 110 { DBG_STATE("NULL <>"); ADD_ATOM_TOKEN(); if (tokens) { tokens->token = 0; tokens->value = ""; tokens->valuelen = 0; tokens++; } ++*ntokens; start++; ADD_ATOM_TOKEN(); goto state_ground; } yy26: yych = *++YYCURSOR; yy27: #line 99 { DBG_STATE("QUOTE STRING"); if (tokens) { tokens->token = '"'; tokens->value = start + 1; tokens->valuelen = YYCURSOR - start - 2; tokens++; } ++*ntokens; goto state_ground; } } #line 152 state_comment: { int comment_depth = 1; while (1) { if (*YYCURSOR == 0) { /* unexpected end of header */ REPORT_ERR("unexpected end of header"); /* fake a quoted string for this last token */ if (tokens) tokens->token = '"'; ++*ntokens; return; } else if (*YYCURSOR == '(') { comment_depth++; } else if (*YYCURSOR == ')' && --comment_depth == 0) { /* end of nested comment sequence */ YYCURSOR++; if (tokens) tokens->valuelen++; break; } else if (*YYCURSOR == '\\' && YYCURSOR[1]) { YYCURSOR++; if (tokens) tokens->valuelen++; } YYCURSOR++; } if (tokens) { tokens->valuelen = YYCURSOR - tokens->value; tokens++; } ++*ntokens; goto state_ground; } stop: #if DEBUG_RFC822_SCANNER printf("STOPing parser ntokens=%d YYCURSOR=%p YYLIMIT=%p start=%p cursor=[%d] %s start=%s\n", *ntokens, YYCURSOR, YYLIMIT, start, *YYCURSOR, YYCURSOR, start); #else ; #endif } PHP_MAILPARSE_API php_rfc822_tokenized_t *php_mailparse_rfc822_tokenize(const char *header, int report_errors TSRMLS_DC) { php_rfc822_tokenized_t *toks = ecalloc(1, sizeof(php_rfc822_tokenized_t)); int len = strlen(header); toks->buffer = emalloc(len + 2); strcpy(toks->buffer, header); toks->buffer[len] = 0; toks->buffer[len+1] = 0; /* mini hack: the parser sometimes relies in this */ tokenize(toks->buffer, NULL, &toks->ntokens, report_errors TSRMLS_CC); toks->tokens = toks->ntokens ? ecalloc(toks->ntokens, sizeof(php_rfc822_token_t)) : NULL; tokenize(toks->buffer, toks->tokens, &toks->ntokens, report_errors TSRMLS_CC); return toks; } PHP_MAILPARSE_API void php_rfc822_tokenize_free(php_rfc822_tokenized_t *toks) { if (toks->tokens) efree(toks->tokens); efree(toks->buffer); efree(toks); } PHP_MAILPARSE_API char *php_rfc822_recombine_tokens(php_rfc822_tokenized_t *toks, int first_token, int n_tokens, int flags) { char *ret = NULL; int i, upper, last_was_atom = 0, this_is_atom = 0, tok_equiv; size_t len = 1; /* for the NUL terminator */ upper = first_token + n_tokens; if (upper > toks->ntokens) upper = toks->ntokens; for (i = first_token; i < upper; i++, last_was_atom = this_is_atom) { tok_equiv = toks->tokens[i].token; if (tok_equiv == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES) tok_equiv = '"'; if (flags & PHP_RFC822_RECOMBINE_IGNORE_COMMENTS && tok_equiv == '(') continue; if (flags & PHP_RFC822_RECOMBINE_COMMENTS_ONLY && tok_equiv != '(' && !(toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES)) continue; this_is_atom = php_rfc822_token_is_atom(toks->tokens[i].token); if (this_is_atom && last_was_atom && flags & PHP_RFC822_RECOMBINE_SPACE_ATOMS) len++; /* allow room for a space */ if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"') len += 2; len += toks->tokens[i].valuelen; } last_was_atom = this_is_atom = 0; ret = emalloc(len); for (i = first_token, len = 0; i < upper; i++, last_was_atom = this_is_atom) { const char *tokvalue; int toklen; tok_equiv = toks->tokens[i].token; if (tok_equiv == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES) tok_equiv = '"'; if (flags & PHP_RFC822_RECOMBINE_IGNORE_COMMENTS && tok_equiv == '(') continue; if (flags & PHP_RFC822_RECOMBINE_COMMENTS_ONLY && tok_equiv != '(' && !(toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES)) continue; tokvalue = toks->tokens[i].value; toklen = toks->tokens[i].valuelen; this_is_atom = php_rfc822_token_is_atom(toks->tokens[i].token); if (this_is_atom && last_was_atom && flags & PHP_RFC822_RECOMBINE_SPACE_ATOMS) { ret[len] = ' '; len++; } if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"') ret[len++] = '"'; if (toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES) { /* don't include ( and ) in the output string */ tokvalue++; toklen -= 2; } memcpy(ret + len, tokvalue, toklen); len += toklen; if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"') ret[len++] = '"'; } ret[len] = 0; if (flags & PHP_RFC822_RECOMBINE_STRTOLOWER) php_strtolower(ret, len); return ret; } static void parse_address_tokens(php_rfc822_tokenized_t *toks, php_rfc822_addresses_t *addrs, int *naddrs) { int start_tok = 0, iaddr = 0, i, in_group = 0, group_lbl_start, group_lbl_end; int a_start, a_count; /* position and count for address part of a name */ smart_str group_addrs = { 0, }; char *address_value = NULL; address: /* mailbox / group */ if (start_tok >= toks->ntokens) { /* the end */ *naddrs = iaddr; smart_str_free(&group_addrs); return; } /* look ahead to determine if we are dealing with a group */ for (i = start_tok; i < toks->ntokens; i++) if (toks->tokens[i].token != 0 && toks->tokens[i].token != '"') break; if (i < toks->ntokens && toks->tokens[i].token == ':') { /* it's a group */ in_group = 1; group_lbl_start = start_tok; group_lbl_end = i; /* we want the address for the group to include the leading ":" and the trailing ";" */ start_tok = i; } mailbox: /* addr-spec / phrase route-addr */ if (start_tok >= toks->ntokens) { /* the end */ *naddrs = iaddr; smart_str_free(&group_addrs); return; } /* skip spurious commas */ while (start_tok < toks->ntokens && (toks->tokens[start_tok].token == ',' || toks->tokens[start_tok].token == ';')) start_tok++; /* look ahead: if we find a '<' before we find an '@', we are dealing with a route-addr, otherwise we have an addr-spec */ for (i = start_tok; i < toks->ntokens && toks->tokens[i].token != ';' && toks->tokens[i].token != ',' && toks->tokens[i].token != '<'; i++) ; /* the stuff from start_tok to i - 1 is the display name part */ if (addrs && !in_group && i - start_tok > 0) { int j, has_comments = 0, has_strings = 0; switch(toks->tokens[i].token) { case ';': case ',': case '<': addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok, i - start_tok, PHP_RFC822_RECOMBINE_SPACE_ATOMS); break; default: /* it's only the display name if there are quoted strings or comments in there */ for (j = start_tok; j < i; j++) { if (toks->tokens[j].token == '(') has_comments = 1; if (toks->tokens[j].token == '"') has_strings = 1; } if (has_comments && !has_strings) { addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok, i - start_tok, PHP_RFC822_RECOMBINE_SPACE_ATOMS | PHP_RFC822_RECOMBINE_COMMENTS_ONLY | PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES ); } else if (has_strings) { addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok, i - start_tok, PHP_RFC822_RECOMBINE_SPACE_ATOMS); } } } if (i < toks->ntokens && toks->tokens[i].token == '<') { int j; /* RFC822: route-addr = "<" [route] addr-spec ">" */ /* look for the closing '>' and recombine as the address part */ for (j = i; j < toks->ntokens && toks->tokens[j].token != '>'; j++) ; if (addrs) { a_start = i; a_count = j-i; /* if an address is enclosed in <>, leave them out of the the * address value that we return */ if (toks->tokens[a_start].token == '<') { a_start++; a_count--; } address_value = php_rfc822_recombine_tokens(toks, a_start, a_count, PHP_RFC822_RECOMBINE_SPACE_ATOMS| PHP_RFC822_RECOMBINE_IGNORE_COMMENTS| PHP_RFC822_RECOMBINE_INCLUDE_QUOTES); } start_tok = ++j; } else { /* RFC822: addr-spec = local-part "@" domain */ if (addrs) { a_start = start_tok; a_count = i - start_tok; /* if an address is enclosed in <>, leave them out of the the * address value that we return */ if (toks->tokens[a_start].token == '<') { a_start++; a_count--; } address_value = php_rfc822_recombine_tokens(toks, a_start, a_count, PHP_RFC822_RECOMBINE_SPACE_ATOMS| PHP_RFC822_RECOMBINE_IGNORE_COMMENTS| PHP_RFC822_RECOMBINE_INCLUDE_QUOTES); } start_tok = i; } if (addrs && address_value) { /* if no display name has been given, use the address */ if (addrs->addrs[iaddr].name == NULL) { addrs->addrs[iaddr].name = estrdup(address_value); } if (in_group) { if (group_addrs.len) smart_str_appendl(&group_addrs, ",", 1); smart_str_appends(&group_addrs, address_value); efree(address_value); } else { addrs->addrs[iaddr].address = address_value; } address_value = NULL; } if (!in_group) { iaddr++; goto address; } /* still dealing with a group. If we find a ";", that's the end of the group */ if ((start_tok < toks->ntokens && toks->tokens[start_tok].token == ';') || start_tok == toks->ntokens) { /* end of group */ if (addrs) { smart_str_appendl(&group_addrs, ";", 1); smart_str_0(&group_addrs); addrs->addrs[iaddr].address = estrdup(group_addrs.c); group_addrs.len = 0; STR_FREE(addrs->addrs[iaddr].name); addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, group_lbl_start, group_lbl_end - group_lbl_start, PHP_RFC822_RECOMBINE_SPACE_ATOMS); addrs->addrs[iaddr].is_group = 1; } iaddr++; in_group = 0; start_tok++; goto address; } /* look for more mailboxes in this group */ goto mailbox; } PHP_MAILPARSE_API php_rfc822_addresses_t *php_rfc822_parse_address_tokens(php_rfc822_tokenized_t *toks) { php_rfc822_addresses_t *addrs = ecalloc(1, sizeof(php_rfc822_addresses_t)); parse_address_tokens(toks, NULL, &addrs->naddrs); addrs->addrs = addrs->naddrs ? ecalloc(addrs->naddrs, sizeof(php_rfc822_address_t)) : NULL; parse_address_tokens(toks, addrs, &addrs->naddrs); return addrs; } PHP_MAILPARSE_API void php_rfc822_free_addresses(php_rfc822_addresses_t *addrs) { int i; for (i = 0; i < addrs->naddrs; i++) { if (addrs->addrs[i].name) STR_FREE(addrs->addrs[i].name); STR_FREE(addrs->addrs[i].address); } if (addrs->addrs) efree(addrs->addrs); efree(addrs); } void php_rfc822_print_addresses(php_rfc822_addresses_t *addrs) { int i; printf("printing addresses %p\n", addrs); fflush(stdout); for (i = 0; i < addrs->naddrs; i++) { printf("addr %d: name=%s address=%s\n", i, addrs->addrs[i].name, addrs->addrs[i].address); } } void php_rfc822_print_tokens(php_rfc822_tokenized_t *toks) { int i; for (i = 0; i < toks->ntokens; i++) { printf("token %d: token=%d/%c len=%d value=%s\n", i, toks->tokens[i].token, toks->tokens[i].token, toks->tokens[i].valuelen, toks->tokens[i].value); } } PHP_FUNCTION(mailparse_test) { char *header; long header_len; php_rfc822_tokenized_t *toks; php_rfc822_addresses_t *addrs; struct rfc822t *t; int i; if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &header, &header_len) == FAILURE) { RETURN_FALSE; } #if 0 t = mailparse_rfc822t_alloc(header, NULL); for (i = 0; i < t->ntokens; i++) { printf("token %d: token=%d/%c len=%d value=%s\n", i, t->tokens[i].token, t->tokens[i].token, t->tokens[i].len, t->tokens[i].ptr); } mailparse_rfc822t_free(t); printf("--- and now:\n"); #endif toks = php_mailparse_rfc822_tokenize((const char*)header, 1 TSRMLS_CC); php_rfc822_print_tokens(toks); addrs = php_rfc822_parse_address_tokens(toks); php_rfc822_print_addresses(addrs); php_rfc822_free_addresses(addrs); php_rfc822_tokenize_free(toks); } /* * Local variables: * tab-width: 4 * c-basic-offset: 4 * End: * vim600: sw=4 ts=4 fdm=marker syn=c * vim<600: sw=4 ts=4 */