/*
**  @(#) $Id: RobotMain.c,v 1.11 1999/03/14 02:21:09 frystyk Exp $
**
**  W3C Webbot can be found at "http://www.w3.org/Robot/"
**
**  Copyright 1995-1998 World Wide Web Consortium, (Massachusetts
**  Institute of Technology, Institut National de Recherche en
**  Informatique et en Automatique, Keio University). All Rights
**  Reserved. This program is distributed under the W3C's Software
**  Intellectual Property License. This program is distributed in the hope
**  that it will be useful, but WITHOUT ANY WARRANTY; without even the
**  implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR
**  PURPOSE. See W3C License http://www.w3.org/Consortium/Legal/ for more
**  details.
**
**  Authors:
**      HFN     Henrik Frystyk Nielsen, (frystyk@w3.org)
**      BR      Bob Racko
**      JP      John Punin
**
**  History:
**      Dec 04 95       First version
**      Oct 1998        Split into separate files
*/

#include "HTRobMan.h"
#include "RobotTxt.h"

/* True when a robot object exists and the MR_QUIET flag is not set */
#define SHOW_QUIET(mr)          ((mr) && !((mr)->flags & MR_QUIET))

/* True when a robot object exists and the MR_REAL_QUIET flag is not set */
#define SHOW_REAL_QUIET(mr)     ((mr) && !((mr)->flags & MR_REAL_QUIET))

/* ------------------------------------------------------------------------- */
/*                                MAIN PROGRAM                               */
/* ------------------------------------------------------------------------- */

/*
**  Print callback registered with HTPrint_setCallback(): formats the
**  message onto stdout and returns whatever vfprintf() returns.
*/
PRIVATE int printer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stdout, fmt, pArgs));
}

/*
**  Trace callback registered with HTTrace_setCallback(): formats the
**  message onto stderr and returns whatever vfprintf() returns.
*/
PRIVATE int tracer (const char * fmt, va_list pArgs)
{
    return (vfprintf(stderr, fmt, pArgs));
}

/*
**  Program entry point.
**
**  1) Initializes the library with a robot profile and creates the
**     Robot object.
**  2) Scans the command line: arguments starting with `-' are options,
**     the first other argument is the start URI and any further ones
**     are collected as search keywords.
**  3) Opens the requested log/output files, sets up the cache, the
**     redirection filters and (when built with HT_POSIX_REGEX) the
**     robots.txt exclusion expression.
**  4) Issues the first request and enters the event loop.
**
**  NOTE(review): several paths call Cleanup() and then carry on as if it
**  never returned (for example the "no URI given" case below) - Cleanup()
**  presumably terminates the process; confirm against its definition.
*/
int main (int argc, char ** argv)
{
    int                 status = 0;
    int                 arg;
    BOOL                cache = NO;             /* Use persistent cache */
    BOOL                flush = NO;             /* flush the persistent cache */
    char *              cache_root = NULL;
    int                 cache_size = DEFAULT_CACHE_SIZE;
    HTChunk *           keywords = NULL;        /* From command line */
    int                 keycnt = 0;
    Robot *             mr = NULL;
    Finger *            finger = NULL;
    HTParentAnchor *    startAnchor = NULL;

    /* Starts Mac GUSI socket library */
#ifdef GUSI
    GUSISetup(GUSIwithSIOUXSockets);
    GUSISetup(GUSIwithInternetSockets);
#endif

#ifdef __MWERKS__ /* STR */
    InitGraf((Ptr) &qd.thePort);
    InitFonts();
    InitWindows();
    InitMenus();
    TEInit();
    InitDialogs(nil);
    InitCursor();
    SIOUXSettings.asktosaveonclose = false;
    argc=ccommand(&argv);
#endif /* __MWERKS__ */

#ifdef HT_MEMLOG
    HTMemLog_open(DEFAULT_MEMLOG, 8192, YES);
#endif

    /* Initiate W3C Reference Library with a robot profile */
    HTProfile_newRobot(APP_NAME, APP_VERSION);

    /* Need our own trace and print functions */
    HTPrint_setCallback(printer);
    HTTrace_setCallback(tracer);

    /* Build a new robot object */
    mr = Robot_new();

    /*
    **  Scan command Line for parameters.
    **
    **  Options taking a value use the next argument only when it exists
    **  and does not itself start with `-'; otherwise the default applies.
    **
    **  NOTE(review): the text between "arg" and "logfile" was lost in the
    **  damaged copy of this file. The loop header, the leading `-' test
    **  and the log file option are required by the brace structure of
    **  the rest of the loop; the "-n" and "-h"/"-?" options and the exact
    **  "-l" spelling were restored from memory of the upstream sources -
    **  confirm against the W3C distribution.
    */
    for (arg=1; arg<argc; arg++) {
        if (*argv[arg] == '-') {

            /* non-interactive */
            if (!strcmp(argv[arg], "-n")) {
                HTAlert_setInteractive(NO);

            /* help */
            } else if (!strcmp(argv[arg], "-h") || !strcmp(argv[arg], "-?")) {
                VersionInfo();
                Cleanup(mr, 0);

            /* clf log file */
            } else if (!strcmp(argv[arg], "-l")) {
                mr->logfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_LOG_FILE;
                mr->flags |= MR_LOGGING;

            /* referer log file */
            } else if (!strncmp(argv[arg], "-ref", 4)) {
                mr->reffile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REFERER_FILE;
                mr->flags |= MR_LOGGING;

            /* Not found error log file */
            } else if (!strncmp(argv[arg], "-404", 4)) {
                mr->notfoundfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_NOTFOUND_FILE;
                mr->flags |= MR_LOGGING;

            /* reject log file */
            } else if (!strncmp(argv[arg], "-rej", 4)) {
                mr->rejectfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REJECT_FILE;
                mr->flags |= MR_LOGGING;

            /* no alt tags log file */
            } else if (!strncmp(argv[arg], "-alt", 4)) {
                mr->noalttagfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_NOALTTAG_FILE;
                mr->flags |= MR_LOGGING;

            /* negotiated resource log file */
            } else if (!strncmp(argv[arg], "-neg", 4)) {
                mr->connegfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_CONNEG_FILE;
                mr->flags |= MR_LOGGING;

            /* hit file log */
            } else if (!strcmp(argv[arg], "-hit")) {
                mr->hitfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_HIT_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            /* link relations file log */
            } else if (!strcmp(argv[arg], "-rellog")) {
                mr->relfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_REL_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            /* Specific link relation to look for (only used if -rellog is also given) */
            } else if (!strcmp(argv[arg], "-relation")) {
                mr->relation = (arg+1 < argc && *argv[arg+1] != '-') ?
                    (HTLinkType) HTAtom_caseFor(argv[++arg]) : NULL;
                mr->flags |= MR_DISTRIBUTIONS;

            /* last modified log file */
            } else if (!strcmp(argv[arg], "-lm")) {
                mr->lmfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_LM_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            /* title log file */
            } else if (!strcmp(argv[arg], "-title")) {
                mr->titlefile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_TITLE_FILE;
                mr->flags |= MR_DISTRIBUTIONS;

            /* mediatype distribution log file */
            } else if (!strncmp(argv[arg], "-for", 4)) {
                mr->mtfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_FORMAT_FILE;
                mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);

            /* charset distribution log file */
            } else if (!strncmp(argv[arg], "-char", 5)) {
                mr->charsetfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_CHARSET_FILE;
                mr->flags |= (MR_KEEP_META | MR_DISTRIBUTIONS);

            /* rule file */
            } else if (!strcmp(argv[arg], "-r")) {
                mr->rules = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_RULE_FILE;

            /* Don't follow HTML META tags with robot information */
            } else if (!strcmp(argv[arg], "-nometatags")) {
                mr->flags |= MR_NOMETATAGS;

            /* output filename */
            } else if (!strcmp(argv[arg], "-o")) {
                mr->outputfile = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_OUTPUT_FILE;

            /* URI prefix */
            } else if (!strcmp(argv[arg], "-prefix")) {
                char * prefix = NULL;
                prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_PREFIX;
                /* Store a non-empty prefix with a trailing `*' appended */
                if (*prefix && *prefix != '*') {
                    StrAllocCopy(mr->prefix, prefix);
                    StrAllocCat(mr->prefix, "*");
                }

            /* timeout -- Change the default request timeout */
            } else if (!strcmp(argv[arg], "-timeout")) {
                int timeout = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_TIMEOUT;
                if (timeout > 1) mr->timer = timeout*MILLIES;

            /* wait -- Change the default pwait time */
            } else if (!strcmp(argv[arg], "-wait")) {
                int waits = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : 0;
                if (waits > 0) mr->waits = waits;

            /* Force no pipelined requests */
            } else if (!strcmp(argv[arg], "-nopipe")) {
                HTTP_setConnectionMode(HTTP_11_NO_PIPELINING);

            /* Stream write flush delay in ms */
            } else if (!strcmp(argv[arg], "-delay")) {
                int delay = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_DELAY;
                HTHost_setDefaultWriteDelay(delay);

            /* Start the persistent cache */
            } else if (!strcmp(argv[arg], "-cache")) {
                cache = YES;

            /* Determine the cache root */
            } else if (!strcmp(argv[arg], "-cacheroot")) {
                cache_root = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : NULL;

            /* Persistent cache flush */
            } else if (!strcmp(argv[arg], "-flush")) {
                flush = YES;

            /* Do a cache validation */
            } else if (!strcmp(argv[arg], "-validate")) {
                mr->flags |= MR_VALIDATE;

            /* Size of the persistent cache */
            } else if (!strcmp(argv[arg], "-cache_size")) {
                cache_size = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_CACHE_SIZE;

            /* Do an end-to-end cache-validation */
            } else if (!strcmp(argv[arg], "-endvalidate")) {
                mr->flags |= MR_END_VALIDATE;

            /* preemptive or non-preemptive access */
            } else if (!strcmp(argv[arg], "-single")) {
                mr->flags |= MR_PREEMPTIVE;

            /* test inlined images */
            } else if (!strcmp(argv[arg], "-img")) {
                mr->flags |= MR_IMG;

            /* load inlined images */
            } else if (!strcmp(argv[arg], "-saveimg")) {
                mr->flags |= (MR_IMG | MR_SAVE);

            /* URI prefix for inlined images */
            } else if (!strcmp(argv[arg], "-imgprefix")) {
                char * prefix = NULL;
                prefix = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_IMG_PREFIX;
                /* Store a non-empty prefix with a trailing `*' appended */
                if (*prefix && *prefix!='*') {
                    StrAllocCopy(mr->img_prefix, prefix);
                    StrAllocCat(mr->img_prefix, "*");
                }

            /* load anchors */
            } else if (!strcmp(argv[arg], "-link") || !strcmp(argv[arg], "-depth")) {
                mr->flags |= MR_LINK;
                mr->depth = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : DEFAULT_DEPTH;

            /* load fixed number of anchors */
            } else if (!strcmp(argv[arg], "-ndoc")) {
                mr->ndoc = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : -1 ;

            /* Output start and end time */
            } else if (!strcmp(argv[arg], "-ss")) {
                mr->flags |= MR_TIME;

            /* print version and exit */
            } else if (!strcmp(argv[arg], "-version")) {
                VersionInfo();
                Cleanup(mr, 0);

            /* run in BFS mode */
            } else if (!strcmp(argv[arg], "-bfs")) {
                mr->flags |= MR_BFS;

            /* run in quiet mode */
            } else if (!strcmp(argv[arg], "-q")) {
                mr->flags |= MR_QUIET;

            /* run in really quiet mode */
            } else if (!strcmp(argv[arg], "-Q")) {
                mr->flags |= MR_REAL_QUIET;

            /* run in redirection mode */
            } else if (!strcmp(argv[arg], "-redir")) {
                mr->flags |= MR_REDIR;
                mr->redir_code = (arg+1 < argc && *argv[arg+1] != '-') ?
                    atoi(argv[++arg]) : 0;

#ifdef WWWTRACE
            /* trace flags: everything after "-v" is the trace mask */
            } else if (!strncmp(argv[arg], "-v", 2)) {
                HTSetTraceMessageMask(argv[arg]+2);
#endif

#ifdef HT_POSIX_REGEX
            /* If we can link against a POSIX regex library */
            } else if (!strncmp(argv[arg], "-inc", 4)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->include = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
                }
            } else if (!strncmp(argv[arg], "-exc", 4)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->exclude = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
                }
            } else if (!strncmp(argv[arg], "-check", 6)) {
                if (arg+1 < argc && *argv[arg+1] != '-') {
                    mr->check = get_regtype(mr, argv[++arg], W3C_DEFAULT_REGEX_FLAGS);
                }
            } else if (!strcmp(argv[arg], "-norobotstxt")) {
                mr->flags |= MR_NOROBOTSTXT;
#endif

#ifdef HT_MYSQL
            /* If we can link against a MYSQL database library */
            } else if (!strncmp(argv[arg], "-sqldb", 5)) {
                mr->sqldb = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_DB;

            } else if (!strncmp(argv[arg], "-sqlclearlinks", 10)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_LINKS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearrequests", 12)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_REQUESTS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearresources", 12)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_RESOURCES_TABLE;

            } else if (!strncmp(argv[arg], "-sqlclearuris", 10)) {
                mr->sqlflags |= HTSQLLOG_CLEAR_URIS_TABLE;

            } else if (!strncmp(argv[arg], "-sqlexternals", 5)) {
                mr->sqlexternals = YES;

            } else if (!strncmp(argv[arg], "-sqlpassword", 5)) {
                mr->sqlpw = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_PW;

            } else if (!strncmp(argv[arg], "-sqlrelative", 5)) {
                mr->sqlrelative = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : NULL;

            } else if (!strncmp(argv[arg], "-sqlserver", 5)) {
                mr->sqlserver = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_SERVER;

            } else if (!strncmp(argv[arg], "-sqluser", 5)) {
                mr->sqluser = (arg+1 < argc && *argv[arg+1] != '-') ?
                    argv[++arg] : DEFAULT_SQL_USER;
#endif

            } else {
                if (SHOW_REAL_QUIET(mr)) HTPrint("Bad Argument (%s)\n", argv[arg]);
            }
        } else {        /* If no leading `-' then check for URL or keywords */
            if (!keycnt) {
                HyperDoc * hd;

                /* First plain argument: the start URI, resolved against cwd */
                mr->furl = HTParse(argv[arg], mr->cwd, PARSE_ALL);
                startAnchor = HTAnchor_parent(HTAnchor_findAddress(mr->furl));
                hd = HyperDoc_new(mr, startAnchor, 0);
                hd->method = METHOD_GET;
                keycnt = 1;
            } else {    /* Check for successive keyword arguments */
                char *escaped = HTEscape(argv[arg], URL_XALPHAS);

                /*
                **  keycnt is 1 for the first keyword, so that one creates
                **  the chunk; later keywords are separated by a space.
                */
                if (keycnt++ <= 1)
                    keywords = HTChunk_new(128);
                else
                    HTChunk_putc(keywords, ' ');
                HTChunk_puts(keywords, HTStrip(escaped));
                HT_FREE(escaped);
            }
        }
    }

    /* No start URI given: show the version information and clean up */
    if (!keycnt) {
        VersionInfo();
        Cleanup(mr, 0);
    }

    /* A non-default depth is only accepted together with a URI prefix */
    if (mr->depth != DEFAULT_DEPTH &&
        (mr->prefix == NULL || *mr->prefix == '*')) {
        /* The message has no conversion specifier, so it takes no arguments */
        if (SHOW_REAL_QUIET(mr))
            HTPrint("A depth of more than 0 requires that you also specify a URI prefix.\n");
        Cleanup(mr, -1);
    }

    /* Testing that HTPrint is working */
    if (mr->flags & MR_TIME) {
        if (SHOW_REAL_QUIET(mr)) {
            time_t local = time(NULL);
            HTPrint("Welcome to the W3C mini Robot version %s - started on %s\n",
                    APP_VERSION, HTDateTimeStr(&local, YES));
        }
    }

    /* Rule file specified? */
    if (mr->rules) {
        char * rules = HTParse(mr->rules, mr->cwd, PARSE_ALL);
        if (!HTLoadRulesAutomatically(rules))
            if (SHOW_REAL_QUIET(mr)) HTPrint("Can't access rules\n");
        HT_FREE(rules);
    }

    /* Output file specified? Fall back to OUTPUT if it cannot be opened */
    if (mr->outputfile) {
        if ((mr->output = fopen(mr->outputfile, "wb")) == NULL) {
            if (SHOW_REAL_QUIET(mr)) HTPrint("Can't open `%s'\n", mr->outputfile);
            mr->output = OUTPUT;
        }
    }

    /* Zero-initialized array of depth+2 integers stored in mr->cdepth */
    if ((mr->cdepth = (int *) HT_CALLOC(mr->depth+2, sizeof(int)))==NULL)
        HT_OUTOFMEM("main");

    /* Should we use persistent cache? */
    if (cache) {
        HTCacheInit(cache_root, cache_size);

        /* Should we start by flushing? */
        if (flush) HTCache_flushAll();
    }

    /* SQL Log specified? */
#ifdef HT_MYSQL
    if (mr->sqlserver) {
        if ((mr->sqllog =
             HTSQLLog_open(mr->sqlserver,
                           mr->sqluser ? mr->sqluser : DEFAULT_SQL_USER,
                           mr->sqlpw ? mr->sqlpw : DEFAULT_SQL_PW,
                           mr->sqldb ? mr->sqldb : DEFAULT_SQL_DB,
                           mr->sqlflags)) != NULL) {
            if (mr->sqlrelative) HTSQLLog_makeRelativeTo(mr->sqllog, mr->sqlrelative);
        }
    }
#endif

    /* CLF Log file specified? */
    if (mr->logfile) {
        mr->log = HTLog_open(mr->logfile, YES, YES);
        if (mr->log) HTNet_addAfter(HTLogFilter, NULL, mr->log, HT_ALL, HT_FILTER_LATE);
    }

    /* Referer Log file specified? */
    if (mr->reffile) {
        mr->ref = HTLog_open(mr->reffile, YES, YES);
        if (mr->ref)
            HTNet_addAfter(HTRefererFilter, NULL, mr->ref, HT_ALL, HT_FILTER_LATE);
    }

    /* Not found error log specified? The filter is registered for status -404 */
    if (mr->notfoundfile) {
        mr->notfound = HTLog_open(mr->notfoundfile, YES, YES);
        if (mr->notfound)
            HTNet_addAfter(HTRefererFilter, NULL, mr->notfound, -404, HT_FILTER_LATE);
    }

    /*
    **  Check that the redirection code is valid. A code of 0 registers
    **  the handler for all four redirection statuses; any other value
    **  must match exactly one of them.
    */
    if (mr->flags & MR_REDIR) {
        BOOL isredir = NO;
        if (mr->redir_code == HT_PERM_REDIRECT || mr->redir_code == 0) {
            HTNet_addAfter(redirection_handler, "http://*" , NULL, HT_PERM_REDIRECT, HT_FILTER_LATE);
            isredir = YES;
        }
        if (mr->redir_code == HT_TEMP_REDIRECT || mr->redir_code == 0) {
            HTNet_addAfter(redirection_handler, "http://*", NULL, HT_TEMP_REDIRECT, HT_FILTER_LATE);
            isredir = YES;
        }
        if (mr->redir_code == HT_FOUND || mr->redir_code == 0) {
            HTNet_addAfter(redirection_handler, "http://*", NULL, HT_FOUND, HT_FILTER_LATE);
            isredir = YES;
        }
        if (mr->redir_code == HT_SEE_OTHER || mr->redir_code == 0) {
            HTNet_addAfter(redirection_handler, "http://*", NULL, HT_SEE_OTHER, HT_FILTER_LATE);
            isredir = YES;
        }
        if (!isredir) {
            if (SHOW_REAL_QUIET(mr)) HTPrint("%d is not a valid redirection code\n", mr->redir_code);
            Cleanup(mr, -1);
        }
    }

    /* Negotiated resource log specified? */
    if (mr->connegfile) mr->conneg = HTLog_open(mr->connegfile, YES, YES);

    /* No alt tags log file specified? */
    if (mr->noalttagfile) mr->noalttag = HTLog_open(mr->noalttagfile, YES, YES);

    /* Reject Log file specified? */
    if (mr->rejectfile) mr->reject = HTLog_open(mr->rejectfile, YES, YES);

#ifdef HT_POSIX_REGEX
    /* Unless disabled, turn the site's robots.txt into an exclusion regex */
    if(!(mr->flags & MR_NOROBOTSTXT)) {
        char *ruri = HTParse(ROBOTS_TXT, mr->furl, PARSE_ALL);
        char *robot_str = get_robots_txt(ruri);
        char *reg_exp_robot = robot_str ? scan_robots_txt(robot_str,APP_NAME) : NULL;

        if (SHOW_REAL_QUIET(mr)) HTPrint("robots.txt uri is `%s'\n", ruri);

        if(robot_str) HT_FREE(robot_str);

        if(reg_exp_robot) {
            mr->exc_robot = get_regtype(mr, reg_exp_robot, W3C_DEFAULT_REGEX_FLAGS);
            HT_FREE(reg_exp_robot);
        }
        HT_FREE(ruri);
    }
#endif

    /* Add our own HTML HText functions */
    Robot_registerHTMLParser();

    /* Register our own terminate filter */
    HTNet_addAfter(terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /* If doing breadth first search */
    if (mr->flags & MR_BFS)
        HTNet_addAfter(bfs_terminate_handler, NULL, NULL, HT_ALL, HT_FILTER_LAST);

    /* Setting event timeout */
    HTHost_setEventTimeout(mr->timer);

    mr->time = HTGetTimeInMillis();

    /* Start the request */
    finger = Finger_new(mr, startAnchor, METHOD_GET);

    /*
    ** Make sure that the first request is flushed immediately and not
    ** buffered in the output buffer
    */
    HTRequest_setFlush(finger->request, YES);

    /*
    ** Check whether we should do some kind of cache validation on
    ** the load
    */
    if (mr->flags & MR_VALIDATE)
        HTRequest_setReloadMode(finger->request, HT_CACHE_VALIDATE);
    if (mr->flags & MR_END_VALIDATE)
        HTRequest_setReloadMode(finger->request, HT_CACHE_END_VALIDATE);

    /*
    **  Now do the load
    */
    if (mr->flags & MR_PREEMPTIVE)
        HTRequest_setPreemptive(finger->request, YES);

    if (keywords)                                                  /* Search */
        status = HTSearchAnchor(keywords, (HTAnchor *)startAnchor, finger->request);
    else
        status = HTLoadAnchor((HTAnchor *)startAnchor, finger->request);

    if (keywords) HTChunk_delete(keywords);
    if (status != YES) {
        if (SHOW_REAL_QUIET(mr)) HTPrint("Can't access resource\n");
        Cleanup(mr, -1);
    }

    /* Go into the event loop... */
    if((mr->flags & MR_PREEMPTIVE) && (mr->flags & MR_BFS))
        Serving_queue(mr);
    else
        HTEventList_loop(finger->request);

    /* Only gets here if event loop fails */
    Cleanup(mr, 0);
    return 0;
}