/* Generated by re2c 0.5 on Sun Mar 7 13:35:34 2004 */
#line 1 "/home/wez/src/php/pecl/mailparse/php_mailparse_rfc822.re"
/*
+----------------------------------------------------------------------+
| PHP Version 4 |
+----------------------------------------------------------------------+
| Copyright (c) 1997-2004 The PHP Group |
+----------------------------------------------------------------------+
| This source file is subject to version 2.02 of the PHP license, |
| that is bundled with this package in the file LICENSE, and is |
| available at through the world-wide-web at |
| http://www.php.net/license/2_02.txt. |
| If you did not receive a copy of the PHP license and are unable to |
| obtain it through the world-wide-web, please send a note to |
| license@php.net so we can mail you a copy immediately. |
+----------------------------------------------------------------------+
| Author: Wez Furlong <wez@thebrainroom.com> |
+----------------------------------------------------------------------+
*/
/* $Id: php_mailparse_rfc822.c,v 1.14 2004/12/28 18:55:20 wez Exp $ */
#include "php.h"
#include "php_mailparse.h"
#include "php_mailparse_rfc822.h"
#include "ext/standard/php_string.h"
#include "ext/standard/php_smart_str.h"
#line 39
#line 48
/* Glue macros consumed by the generated scanner inside tokenize(). */

/* Called by the scanner when it may need more input.  The whole header is
 * already in memory, so the requested count n is ignored: the macro only
 * jumps to the "stop" label once the cursor has reached the limit. */
#define YYFILL(n) if (YYCURSOR == YYLIMIT) goto stop
/* Input is examined as unsigned bytes so that it can index the 256-entry
 * yybm table directly. */
#define YYCTYPE unsigned char
/* The scanner's cursor/limit/marker names are mapped onto short local names;
 * tokenize() declares p and q.
 * NOTE(review): YYMARKER (r) is not referenced by the scanner code visible in
 * this file and tokenize() declares no "r" -- confirm before using it. */
#define YYCURSOR p
#define YYLIMIT q
#define YYMARKER r
/* Set to 1 to make every scanner action print a trace line via printf. */
#define DEBUG_RFC822_SCANNER 0
#if DEBUG_RFC822_SCANNER
# define DBG_STATE(lbl) printf(lbl " %d:%c %d:%c\n", *YYCURSOR, *YYCURSOR, *start, *start)
#else
# define DBG_STATE(lbl)
#endif
/* Record a one-character token whose type is the character at "start".
 * When tokens is NULL (counting pass) nothing is stored, but *ntokens is
 * incremented either way. */
#define ADD_ATOM_TOKEN() do { if (tokens) { tokens->token = *start; tokens->value = start; tokens->valuelen = 1; tokens++; } ++*ntokens; } while (0)
/* Raise an E_WARNING describing the problem, but only when the caller asked
 * for error reporting. */
#define REPORT_ERR(msg) do { if (report_errors) zend_error(E_WARNING, "input is not rfc822 compliant: %s", msg); } while(0)
/* Tokenize a header.
 *
 * header        NUL-terminated text to scan; token values point into it.
 * tokens        output array, or NULL.  When NULL nothing is stored and the
 *               tokens are only counted, allowing the caller to allocate
 *               enough room before calling again with a real array.
 * ntokens       receives the number of tokens found (reset to 0 on entry).
 * report_errors when non-zero, malformed input raises E_WARNING via
 *               REPORT_ERR.
 *
 * Token types produced by the actions below:
 *   0    a run of ordinary characters ("ANY"), value/valuelen cover the run
 *   '"'  a quoted string, value/valuelen exclude the surrounding quotes
 *   '('  a comment, value/valuelen include the surrounding parentheses
 *   c    any other special character c, as a one-character token
 */
static void tokenize(const char *header, php_rfc822_token_t *tokens, int *ntokens, int report_errors TSRMLS_DC)
{
register const char *p, *q, *start;
/* non-zero while between an accepted '<' and its matching '>' */
int in_bracket = 0;
/* NB: parser assumes that the header has two bytes of NUL terminator */
YYCURSOR = header;
/* the limit sits one byte past the first NUL, so the NUL itself is scanned */
YYLIMIT = YYCURSOR + strlen(YYCURSOR) + 1;
*ntokens = 0;
/* Every action jumps back here to begin scanning the next token. */
state_ground:
start = YYCURSOR;
#if DEBUG_RFC822_SCANNER
printf("ground: start=%p limit=%p cursor=%p: [%d] %s\n", start, YYLIMIT, YYCURSOR, *YYCURSOR, YYCURSOR);
#endif
{
YYCTYPE yych;
unsigned int yyaccept;
/* Character class bitmap, indexed by byte value.  As used below:
 *   bit 32  - whitespace skipped in the ground state (TAB, LF, CR, space)
 *   bit 64  - bytes accepted inside a quoted string (all but NUL and '"')
 *   bit 128 - bytes accepted inside an unquoted word ("ANY" token) */
static unsigned char yybm[] = {
0, 192, 192, 192, 192, 192, 192, 192,
192, 96, 96, 192, 192, 96, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
96, 64, 0, 192, 192, 64, 192, 192,
64, 64, 192, 192, 64, 192, 64, 64,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 64, 64, 64, 64, 64, 64,
64, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 64, 192, 64, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
192, 192, 192, 192, 192, 192, 192, 192,
};
goto yy0;
yy1: ++YYCURSOR;
yy0:
if((YYLIMIT - YYCURSOR) < 2) YYFILL(2);
yych = *YYCURSOR;
/* whitespace is handled first; everything else is dispatched by the
 * comparison tree below to the state for that character class */
if(yybm[0+yych] & 32) goto yy4;
if(yych <= '-'){
if(yych <= '%'){
if(yych <= '!'){
if(yych <= '\000') goto yy2;
if(yych <= ' ') goto yy21;
goto yy19;
} else {
if(yych <= '"') goto yy12;
if(yych <= '$') goto yy21;
goto yy19;
}
} else {
if(yych <= ')'){
if(yych <= '\'') goto yy21;
if(yych <= '(') goto yy10;
goto yy7;
} else {
if(yych == ',') goto yy19;
goto yy21;
}
}
} else {
if(yych <= '>'){
if(yych <= ';'){
if(yych <= '/') goto yy19;
if(yych <= '9') goto yy21;
goto yy19;
} else {
if(yych <= '<') goto yy15;
if(yych <= '=') goto yy19;
goto yy17;
}
} else {
if(yych <= '['){
if(yych <= '@') goto yy19;
if(yych <= 'Z') goto yy21;
goto yy19;
} else {
if(yych <= '\\') goto yy9;
if(yych <= ']') goto yy19;
goto yy21;
}
}
}
/* NUL byte: end of input */
yy2: yych = *++YYCURSOR;
yy3:
#line 88
{ goto stop; }
/* run of whitespace: consumed without producing a token */
yy4: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy5: if(yybm[0+yych] & 32) goto yy4;
yy6:
#line 89
{ DBG_STATE("SPACE"); goto state_ground; }
/* a ')' outside a comment, or a backslash not followed by a word byte */
yy7: yych = *++YYCURSOR;
yy8:
#line 90
{ REPORT_ERR("token not valid in ground state"); goto state_ground; }
/* backslash: continues as a word when the next byte is a word byte */
yy9: yych = *++YYCURSOR;
if(yybm[0+yych] & 128) goto yy21;
goto yy8;
/* '(' opens a comment; the token is started here and completed (and
 * counted) by the hand-written state_comment code */
yy10: yych = *++YYCURSOR;
yy11:
#line 91
{ DBG_STATE("START COMMENT");
if (tokens) {
tokens->token = '(';
tokens->value = start;
tokens->valuelen = 0;
}
goto state_comment;
}
/* '"' opens a quoted string: consume bytes until the closing quote */
yy12: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy13: if(yybm[0+yych] & 64) goto yy12;
if(yych >= '\001') goto yy26;
/* NOTE(review): label yy14 is fused with yy15 on the next line, so a quoted
 * string that runs into the NUL terminator falls through into the '<'
 * handling below.  This looks like part of the generated scanner was lost;
 * confirm against the .re source. */
yy14:yy15: yych = *++YYCURSOR;
if(yych == '>') goto yy24;
yy16:
#line 123
{ DBG_STATE("LANGLE");
if (in_bracket) {
REPORT_ERR("already in < bracket");
goto state_ground;
}
in_bracket = 1;
ADD_ATOM_TOKEN();
goto state_ground;
}
yy17: yych = *++YYCURSOR;
yy18:
#line 132
{ DBG_STATE("RANGLE");
if (!in_bracket) {
REPORT_ERR("not in < bracket");
goto state_ground;
}
in_bracket = 0;
ADD_ATOM_TOKEN();
goto state_ground;
}
/* any other special character becomes a one-character token */
yy19: yych = *++YYCURSOR;
yy20:
#line 141
{ DBG_STATE("ATOM"); ADD_ATOM_TOKEN(); goto state_ground; }
/* unquoted word: consume bytes while they have bit 128 set in yybm */
yy21: ++YYCURSOR;
if(YYLIMIT == YYCURSOR) YYFILL(1);
yych = *YYCURSOR;
yy22: if(yybm[0+yych] & 128) goto yy21;
yy23:
#line 142
{ DBG_STATE("ANY");
if (tokens) {
tokens->token = 0;
tokens->valuelen = YYCURSOR - start;
tokens->value = start;
tokens++;
}
++*ntokens;
goto state_ground;
}
/* "<>": emitted as three tokens -- '<', an empty word, and '>' -- without
 * touching in_bracket */
yy24: yych = *++YYCURSOR;
yy25:
#line 110
{ DBG_STATE("NULL <>");
ADD_ATOM_TOKEN();
if (tokens) {
tokens->token = 0;
tokens->value = "";
tokens->valuelen = 0;
tokens++;
}
++*ntokens;
start++;
ADD_ATOM_TOKEN();
goto state_ground;
}
/* closing quote reached: the token value excludes both quote characters */
yy26: yych = *++YYCURSOR;
yy27:
#line 99
{ DBG_STATE("QUOTE STRING");
if (tokens) {
tokens->token = '"';
tokens->value = start + 1;
tokens->valuelen = YYCURSOR - start - 2;
tokens++;
}
++*ntokens;
goto state_ground;
}
}
#line 152
/* Hand-written comment scanner: entered with the cursor just past the
 * opening '(' and tracks nesting depth until the matching ')' is found. */
state_comment:
{
int comment_depth = 1;
while (1) {
if (*YYCURSOR == 0) {
/* unexpected end of header */
REPORT_ERR("unexpected end of header");
/* fake a quoted string for this last token */
/* value and valuelen keep whatever was stored so far: value still points at
 * the '(' and valuelen only counts the backslash escapes seen.  The function
 * returns directly from here rather than going through "stop". */
if (tokens)
tokens->token = '"';
++*ntokens;
return;
} else if (*YYCURSOR == '(') {
comment_depth++;
} else if (*YYCURSOR == ')' && --comment_depth == 0) {
/* end of nested comment sequence */
YYCURSOR++;
if (tokens)
tokens->valuelen++;
break;
} else if (*YYCURSOR == '\\' && YYCURSOR[1]) {
/* backslash escape: skip the escaped byte so it cannot open or close a
 * comment */
YYCURSOR++;
if (tokens)
tokens->valuelen++;
}
YYCURSOR++;
}
/* the running valuelen increments above are superseded here: the final
 * length spans from the opening '(' to just past the closing ')' */
if (tokens) {
tokens->valuelen = YYCURSOR - tokens->value;
tokens++;
}
++*ntokens;
goto state_ground;
}
stop:
#if DEBUG_RFC822_SCANNER
printf("STOPing parser ntokens=%d YYCURSOR=%p YYLIMIT=%p start=%p cursor=[%d] %s start=%s\n", *ntokens,
YYCURSOR, YYLIMIT, start, *YYCURSOR, YYCURSOR, start);
#else
;
#endif
}
/* Tokenize an RFC 822 header into a newly allocated token set.
 *
 * header        NUL-terminated header text; it is copied, so the caller's
 *               buffer does not need to outlive the result.
 * report_errors when non-zero, malformed input raises E_WARNING.
 *
 * Returns a structure owning a private copy of the header and an array of
 * tokens whose values point into that copy.  Release it with
 * php_rfc822_tokenize_free().
 */
PHP_MAILPARSE_API php_rfc822_tokenized_t *php_mailparse_rfc822_tokenize(const char *header, int report_errors TSRMLS_DC)
{
	php_rfc822_tokenized_t *toks = ecalloc(1, sizeof(php_rfc822_tokenized_t));
	size_t len = strlen(header);

	/* the scanner reads one byte past the first NUL, so keep two of them */
	toks->buffer = emalloc(len + 2);
	memcpy(toks->buffer, header, len);
	toks->buffer[len] = 0;
	toks->buffer[len + 1] = 0;

	/* First pass only counts tokens.  It runs silently: the second pass
	 * scans exactly the same input, so reporting in both passes would emit
	 * every warning twice. */
	tokenize(toks->buffer, NULL, &toks->ntokens, 0 TSRMLS_CC);
	toks->tokens = toks->ntokens ? ecalloc(toks->ntokens, sizeof(php_rfc822_token_t)) : NULL;

	/* Second pass fills the array and reports problems if requested. */
	tokenize(toks->buffer, toks->tokens, &toks->ntokens, report_errors TSRMLS_CC);
	return toks;
}
/* Release a token set: the token array (if one was allocated), the private
 * copy of the header, and the container itself. */
PHP_MAILPARSE_API void php_rfc822_tokenize_free(php_rfc822_tokenized_t *toks)
{
	/* the array pointer is NULL when the header produced no tokens */
	if (toks->tokens != NULL) {
		efree(toks->tokens);
	}

	efree(toks->buffer);
	efree(toks);
}
/* Rebuild a string from a run of tokens.
 *
 * toks         token set to read from.
 * first_token  index of the first token to use (negative values are treated
 *              as 0).
 * n_tokens     number of tokens to use; the run is clipped to the end of the
 *              token array.
 * flags        bitmask of PHP_RFC822_RECOMBINE_* values:
 *   COMMENTS_TO_QUOTES  treat '(' tokens as quoted strings and drop their
 *                       enclosing parentheses from the output
 *   IGNORE_COMMENTS     skip tokens that are (still) comments
 *   COMMENTS_ONLY       skip everything that is not a comment
 *   SPACE_ATOMS         put one space between two adjacent atom tokens
 *   INCLUDE_QUOTES      wrap quoted-string tokens in double quotes
 *   STRTOLOWER          lower-case the finished string
 *
 * Returns a newly emalloc'ed, NUL-terminated string owned by the caller.
 */
PHP_MAILPARSE_API char *php_rfc822_recombine_tokens(php_rfc822_tokenized_t *toks, int first_token, int n_tokens, int flags)
{
	char *ret = NULL;
	int i, upper, last_was_atom = 0, this_is_atom = 0, tok_equiv;
	size_t len = 1; /* for the NUL terminator */

	upper = first_token + n_tokens;
	if (upper > toks->ntokens)
		upper = toks->ntokens;
	/* never index before the start of the token array */
	if (first_token < 0)
		first_token = 0;

	/* Pass 1: work out an upper bound for the size of the result. */
	for (i = first_token; i < upper; i++, last_was_atom = this_is_atom) {
		tok_equiv = toks->tokens[i].token;
		if (tok_equiv == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES)
			tok_equiv = '"';

		if (flags & PHP_RFC822_RECOMBINE_IGNORE_COMMENTS && tok_equiv == '(')
			continue;
		if (flags & PHP_RFC822_RECOMBINE_COMMENTS_ONLY && tok_equiv != '(' && !(toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES))
			continue;

		this_is_atom = php_rfc822_token_is_atom(toks->tokens[i].token);
		if (this_is_atom && last_was_atom && flags & PHP_RFC822_RECOMBINE_SPACE_ATOMS)
			len++; /* allow room for a space */

		if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"')
			len += 2;

		/* a bogus negative length must not shrink the allocation */
		if (toks->tokens[i].valuelen > 0)
			len += toks->tokens[i].valuelen;
	}

	last_was_atom = this_is_atom = 0;
	ret = emalloc(len);

	/* Pass 2: same selection logic as above, this time copying the data. */
	for (i = first_token, len = 0; i < upper; i++, last_was_atom = this_is_atom) {
		const char *tokvalue;
		int toklen;

		tok_equiv = toks->tokens[i].token;
		if (tok_equiv == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES)
			tok_equiv = '"';

		if (flags & PHP_RFC822_RECOMBINE_IGNORE_COMMENTS && tok_equiv == '(')
			continue;
		if (flags & PHP_RFC822_RECOMBINE_COMMENTS_ONLY && tok_equiv != '(' && !(toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES))
			continue;

		tokvalue = toks->tokens[i].value;
		toklen = toks->tokens[i].valuelen;

		this_is_atom = php_rfc822_token_is_atom(toks->tokens[i].token);
		if (this_is_atom && last_was_atom && flags & PHP_RFC822_RECOMBINE_SPACE_ATOMS) {
			ret[len] = ' ';
			len++;
		}

		if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"')
			ret[len++] = '"';

		if (toks->tokens[i].token == '(' && flags & PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES) {
			/* don't include ( and ) in the output string */
			if (toklen >= 2) {
				tokvalue++;
				toklen -= 2;
			} else {
				/* too short to hold both parentheses: nothing to copy */
				toklen = 0;
			}
		}

		/* a negative length would turn into a huge size_t in memcpy */
		if (toklen > 0) {
			memcpy(ret + len, tokvalue, toklen);
			len += toklen;
		}

		if (flags & PHP_RFC822_RECOMBINE_INCLUDE_QUOTES && tok_equiv == '"')
			ret[len++] = '"';
	}
	ret[len] = 0;

	if (flags & PHP_RFC822_RECOMBINE_STRTOLOWER)
		php_strtolower(ret, len);

	return ret;
}
/* Walk a token set and split it into addresses (mailboxes and groups).
 *
 * toks    tokens to parse.
 * addrs   destination, or NULL.  With NULL the addresses are only counted;
 *         the caller then allocates addrs->addrs and calls again to fill it.
 *         Both calls must take identical paths through this function, so
 *         nothing that affects control flow may depend on addrs.
 * naddrs  receives the number of addresses found.
 *
 * For a plain mailbox the entry gets a display name and an address.  For a
 * group the entry's name is the group label, its address is the
 * comma-separated member list terminated by ";", and is_group is set.
 */
static void parse_address_tokens(php_rfc822_tokenized_t *toks,
		php_rfc822_addresses_t *addrs, int *naddrs)
{
	int start_tok = 0, iaddr = 0, i, in_group = 0, group_lbl_start = 0, group_lbl_end = 0;
	int a_start, a_count; /* position and count for address part of a name */
	smart_str group_addrs = { 0, };
	char *address_value = NULL;

address: /* mailbox / group */
	if (start_tok >= toks->ntokens) {
		/* the end */
		*naddrs = iaddr;
		smart_str_free(&group_addrs);
		return;
	}

	/* look ahead to determine if we are dealing with a group */
	for (i = start_tok; i < toks->ntokens; i++)
		if (toks->tokens[i].token != 0 && toks->tokens[i].token != '"')
			break;

	if (i < toks->ntokens && toks->tokens[i].token == ':') {
		/* it's a group */
		in_group = 1;
		group_lbl_start = start_tok;
		group_lbl_end = i;

		/* we want the address for the group to include the leading ":" and the trailing ";" */
		start_tok = i;
	}

mailbox: /* addr-spec / phrase route-addr */
	if (start_tok >= toks->ntokens) {
		/* the end */
		*naddrs = iaddr;
		smart_str_free(&group_addrs);
		return;
	}

	/* skip spurious commas */
	while (start_tok < toks->ntokens && (toks->tokens[start_tok].token == ','
			|| toks->tokens[start_tok].token == ';'))
		start_tok++;

	/* look ahead: if we find a '<' before we find an '@', we are dealing with
	   a route-addr, otherwise we have an addr-spec */
	for (i = start_tok; i < toks->ntokens && toks->tokens[i].token != ';'
			&& toks->tokens[i].token != ',' && toks->tokens[i].token != '<'; i++)
		;

	/* the stuff from start_tok to i - 1 is the display name part */
	if (addrs && !in_group && i - start_tok > 0) {
		int j, has_comments = 0, has_strings = 0;
		/* The look-ahead may have run off the end of the tokens; there is
		 * then no terminating token to inspect, which is handled like any
		 * other non-terminator (token type 0 selects the default case). */
		int terminator = (i < toks->ntokens) ? toks->tokens[i].token : 0;

		switch (terminator) {
			case ';': case ',': case '<':
				addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok, i - start_tok,
						PHP_RFC822_RECOMBINE_SPACE_ATOMS);
				break;
			default:
				/* it's only the display name if there are quoted strings or comments in there */
				for (j = start_tok; j < i; j++) {
					if (toks->tokens[j].token == '(')
						has_comments = 1;
					if (toks->tokens[j].token == '"')
						has_strings = 1;
				}
				if (has_comments && !has_strings) {
					addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok,
							i - start_tok,
							PHP_RFC822_RECOMBINE_SPACE_ATOMS | PHP_RFC822_RECOMBINE_COMMENTS_ONLY
							| PHP_RFC822_RECOMBINE_COMMENTS_TO_QUOTES
							);
				} else if (has_strings) {
					addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, start_tok, i - start_tok,
							PHP_RFC822_RECOMBINE_SPACE_ATOMS);
				}
		}
	}

	if (i < toks->ntokens && toks->tokens[i].token == '<') {
		int j;
		/* RFC822: route-addr = "<" [route] addr-spec ">" */
		/* look for the closing '>' and recombine as the address part */
		for (j = i; j < toks->ntokens && toks->tokens[j].token != '>'; j++)
			;

		if (addrs) {
			a_start = i;
			a_count = j - i;
			/* if an address is enclosed in <>, leave them out of the
			 * address value that we return */
			if (toks->tokens[a_start].token == '<') {
				a_start++;
				a_count--;
			}
			address_value = php_rfc822_recombine_tokens(toks, a_start, a_count,
					PHP_RFC822_RECOMBINE_SPACE_ATOMS|
					PHP_RFC822_RECOMBINE_IGNORE_COMMENTS|
					PHP_RFC822_RECOMBINE_INCLUDE_QUOTES);
		}

		/* resume after the '>'; when it is missing this lands one past
		 * the end of the tokens */
		start_tok = ++j;
	} else {
		/* RFC822: addr-spec = local-part "@" domain */
		if (addrs) {
			a_start = start_tok;
			a_count = i - start_tok;
			/* if an address is enclosed in <>, leave them out of the
			 * address value that we return; start_tok may already be at
			 * the end of the tokens, so check bounds before looking */
			if (a_start < toks->ntokens && toks->tokens[a_start].token == '<') {
				a_start++;
				a_count--;
			}
			address_value = php_rfc822_recombine_tokens(toks, a_start, a_count,
					PHP_RFC822_RECOMBINE_SPACE_ATOMS|
					PHP_RFC822_RECOMBINE_IGNORE_COMMENTS|
					PHP_RFC822_RECOMBINE_INCLUDE_QUOTES);
		}
		start_tok = i;
	}

	if (addrs && address_value) {
		/* if no display name has been given, use the address */
		if (addrs->addrs[iaddr].name == NULL) {
			addrs->addrs[iaddr].name = estrdup(address_value);
		}

		if (in_group) {
			/* group members are collected into one comma separated list */
			if (group_addrs.len)
				smart_str_appendl(&group_addrs, ",", 1);
			smart_str_appends(&group_addrs, address_value);
			efree(address_value);
		} else {
			/* ownership of the string passes to the address entry */
			addrs->addrs[iaddr].address = address_value;
		}
		address_value = NULL;
	}

	if (!in_group) {
		iaddr++;
		goto address;
	}

	/* still dealing with a group. If we find a ";", that's the end of the group.
	 * Running out of tokens ends it as well; ">=" matters because an
	 * unterminated "<" leaves start_tok one past the end, and the group must
	 * still be counted so that its entry is allocated before it is filled. */
	if (start_tok >= toks->ntokens || toks->tokens[start_tok].token == ';') {
		/* end of group */
		if (addrs) {
			smart_str_appendl(&group_addrs, ";", 1);
			smart_str_0(&group_addrs);
			addrs->addrs[iaddr].address = estrdup(group_addrs.c);
			/* keep the buffer for the next group, just mark it empty */
			group_addrs.len = 0;

			/* replace the provisional name with the group label */
			STR_FREE(addrs->addrs[iaddr].name);
			addrs->addrs[iaddr].name = php_rfc822_recombine_tokens(toks, group_lbl_start,
					group_lbl_end - group_lbl_start,
					PHP_RFC822_RECOMBINE_SPACE_ATOMS);

			addrs->addrs[iaddr].is_group = 1;
		}

		iaddr++;
		in_group = 0;
		start_tok++;
		goto address;
	}

	/* look for more mailboxes in this group */
	goto mailbox;
}
/* Parse a token set into a newly allocated list of addresses.
 * The parser is run twice: once to count the addresses, and once more to
 * fill the array sized from that count.  The result must be released with
 * php_rfc822_free_addresses(). */
PHP_MAILPARSE_API php_rfc822_addresses_t *php_rfc822_parse_address_tokens(php_rfc822_tokenized_t *toks)
{
	php_rfc822_addresses_t *result;

	result = ecalloc(1, sizeof(php_rfc822_addresses_t));

	/* counting pass: no destination is supplied */
	parse_address_tokens(toks, NULL, &result->naddrs);

	if (result->naddrs) {
		result->addrs = ecalloc(result->naddrs, sizeof(php_rfc822_address_t));
	} else {
		result->addrs = NULL;
	}

	/* filling pass */
	parse_address_tokens(toks, result, &result->naddrs);

	return result;
}
/* Release an address list: the name and address strings of every entry,
 * the entry array (if one was allocated), and the container itself. */
PHP_MAILPARSE_API void php_rfc822_free_addresses(php_rfc822_addresses_t *addrs)
{
	int idx = 0;

	while (idx < addrs->naddrs) {
		if (addrs->addrs[idx].name) {
			STR_FREE(addrs->addrs[idx].name);
		}
		STR_FREE(addrs->addrs[idx].address);
		idx++;
	}

	/* the array pointer is NULL when no addresses were found */
	if (addrs->addrs != NULL) {
		efree(addrs->addrs);
	}
	efree(addrs);
}
/* Debugging aid: write every entry of an address list to stdout, one line
 * per address, preceded by a line showing the list pointer. */
void php_rfc822_print_addresses(php_rfc822_addresses_t *addrs)
{
	int idx = 0;

	printf("printing addresses %p\n", addrs);
	fflush(stdout);

	while (idx < addrs->naddrs) {
		printf("addr %d: name=%s address=%s\n", idx, addrs->addrs[idx].name, addrs->addrs[idx].address);
		idx++;
	}
}
/* Debugging aid: write every token of a token set to stdout, one line per
 * token, showing its type (as number and character), length and value.
 * The value is printed as a C string starting at the token's value pointer. */
void php_rfc822_print_tokens(php_rfc822_tokenized_t *toks)
{
	int idx = 0;

	while (idx < toks->ntokens) {
		printf("token %d: token=%d/%c len=%d value=%s\n", idx, toks->tokens[idx].token, toks->tokens[idx].token,
				toks->tokens[idx].valuelen, toks->tokens[idx].value);
		idx++;
	}
}
/* mailparse_test(string header)
 * Debugging function: tokenizes the given header with error reporting
 * enabled, parses the tokens as an address list, and prints both the tokens
 * and the addresses to stdout.  Returns FALSE when the argument cannot be
 * parsed as a string. */
PHP_FUNCTION(mailparse_test)
{
	char *header;
	/* the "s" specifier stores the string length through an int pointer */
	int header_len;
	php_rfc822_tokenized_t *toks;
	php_rfc822_addresses_t *addrs;

	if (zend_parse_parameters(ZEND_NUM_ARGS() TSRMLS_CC, "s", &header, &header_len) == FAILURE) {
		RETURN_FALSE;
	}

	toks = php_mailparse_rfc822_tokenize((const char*)header, 1 TSRMLS_CC);
	php_rfc822_print_tokens(toks);

	addrs = php_rfc822_parse_address_tokens(toks);
	php_rfc822_print_addresses(addrs);
	php_rfc822_free_addresses(addrs);

	php_rfc822_tokenize_free(toks);
}
/*
* Local variables:
* tab-width: 4
* c-basic-offset: 4
* End:
* vim600: sw=4 ts=4 fdm=marker syn=c
* vim<600: sw=4 ts=4
*/
/* syntax highlighted by Code2HTML, v. 0.9.1 */