#! /usr/bin/env python # # Part of the A-A-P project: File type detection module # Copyright (C) 2002-2003 Stichting NLnet Labs # Permission to copy and use this file is specified in the file COPYING. # If this file is missing you can find it here: http://www.a-a-p.org/COPYING # This module detects the type of a file. # It can be run as a separate program or called from Python. # Many types are recognized by default. More types can be added dynamically. # See the Aap reference manual for an explanation. # # # EXTERNAL INTERFACE: # # ft_detect(fname [, ignore] [, recdict]) # Detects the type of file "fname". # # ft_check_dir(dir [, errmsg] [, recdict]) # Scan directory "dir" for "*.afd" files, which are # loaded with ft_read_file(). # # ft_read_file(fname [, recdict]) # Read file "fname" for detection rules. # # ft_add_rules(str, lnum [, recdict]) # Add file type detection rules from "str". See # the Aap reference manual for the syntax. # # ft_known(type) Returns True if "type" is a known filetype, False # otherwise. # # ft_declare(type) Declare "type" to be a known filetype. # import string import os.path import sys import import_re # import the re module in a special way import glob from Util import * # Make a copy of the recdict after these imports, so that they can be used when # executing Python snippets. exec_recdict = globals().copy() # Set to non-zero when run as a program. _run_as_program = 0 # # The default list of detected file types by suffix. 
#
# Each item is a (suffix, filetype) pair.  The suffix is written WITHOUT the
# leading dot: ft_detect() looks up the text after a dot in the base name.  A
# suffix may itself contain a dot ("tar.gz"); the longest suffix is tried
# first.  The filetype "ignore" means the suffix is stripped and detection is
# repeated on the remainder (only when ft_detect() is called with "ignore"
# set).  A suffix must appear only once in this list.
_def_suffix_list = [
    ("aap", "aap"), ("abc", "abc"), ("abl", "abel"), ("wrm", "acedb"),
    ("ada", "ada"), ("adb", "ada"), ("ads", "ada"), ("afd", "afd"),
    ("tdf", "ahdl"), ("aml", "aml"), ("run", "ampl"),
    ("a", "asm"), ("asm", "asm"), ("lst", "asm"), ("mac", "asm"),
    ("s", "asm"), ("asn", "asn"), ("asn1", "asn"), ("asa", "aspvbs"),
    ("as", "atlas"), ("atl", "atlas"), ("ave", "ave"), ("awk", "awk"),
    ("imp", "b"), ("mch", "b"), ("ref", "b"), ("bc", "bc"), ("bdf", "bdf"),
    ("bib", "bib"), ("bl", "blank"), ("btm", "btm"), ("c", "c"),
    ("cdl", "cdl"), ("cfi", "cf"), ("cfm", "cf"), ("chs", "chaskell"),
    ("eni", "cl"), ("dcl", "clean"), ("icl", "clean"), ("prg", "clipper"),
    ("cbl", "cobol"), ("cob", "cobol"), ("cpy", "cobol"),
    ("c++", "cpp"), ("cc", "cpp"), ("cpp", "cpp"), ("cxx", "cpp"),
    ("h", "cpp"), ("hh", "cpp"), ("hpp", "cpp"), ("hxx", "cpp"),
    ("inl", "cpp"), ("tcc", "cpp"),
    ("cs", "cs"), ("csc", "csc"), ("csh", "csh"), ("tcsh", "csh"),
    ("csp", "csp"), ("fdr", "csp"), ("css", "css"), ("con", "cterm"),
    ("pld", "cupl"), ("si", "cuplsim"), ("cyn", "cynpp"), ("d", "d"),
    ("def", "def"), ("desc", "desc"),
    ("diff", "diff"), ("patch", "diff"), ("rej", "diff"),
    ("bat", "dosbatch"), ("cmd", "dosbatch"), ("sys", "dosbatch"),
    ("ini", "dosini"), ("dot", "dot"),
    ("drac", "dracula"), ("drc", "dracula"),
    ("dsl", "dsl"), ("dtd", "dtd"), ("dylan", "dylan"),
    ("intr", "dylanintr"), ("lid", "dylanlid"), ("ecd", "ecd"),
    ("am", "elf"), ("erl", "erlang"), ("EC", "esqlc"), ("ec", "esqlc"),
    ("exp", "expect"), ("4gh", "fgl"), ("4gl", "fgl"), ("m4gl", "fgl"),
    ("fex", "focexec"), ("focexec", "focexec"),
    ("fs", "forth"), ("ft", "forth"),
    ("F", "fortran"), ("f", "fortran"), ("f77", "fortran"),
    ("f90", "fortran"), ("f95", "fortran"), ("for", "fortran"),
    ("fpp", "fortran"), ("ftn", "fortran"),
    ("gdmo", "gdmo"), ("mo", "gdmo"), ("ged", "gedcom"), ("gif", "gif"),
    ("gpi", "gnuplot"), ("gp", "gp"), ("gsp", "gsp"), ("hs", "haskell"),
    ("hb", "hb"),
    ("errsum", "hercules"), ("ev", "hercules"), ("rs", "hercules"),
    ("sum", "hercules"), ("vc", "hercules"),
    ("h32", "hex"), ("hex", "hex"), ("hog", "hog"), ("rules", "hog"),
    ("htm", "html"), ("html", "html"), ("shtml", "html"),
    ("html.m4", "htmlm4"), ("icn", "icon"), ("idl", "idl"),
    ("Z", "ignore"), ("bak", "ignore"), ("bz2", "ignore"),
    ("gz", "ignore"), ("in", "ignore"), ("new", "ignore"),
    ("old", "ignore"), ("orig", "ignore"),
    # Was misspelled "rmpnew", which no file name carries.
    ("rpmnew", "ignore"), ("rpmsave", "ignore"),
    ("indent.pro", "indent"), ("INF", "inform"), ("inf", "inform"),
    ("iss", "iss"), ("ist", "ist"), ("mst", "ist"),
    ("jpl", "jam"), ("jpr", "jam"), ("jav", "java"), ("java", "java"),
    ("jj", "javacc"), ("jjt", "javacc"),
    ("javascript", "javascript"), ("js", "javascript"),
    ("clp", "jess"), ("jgr", "jgraph"), ("jpg", "jpeg"), ("png", "png"),
    ("properties", "jproperties"), ("jsp", "jsp"), ("kix", "kix"),
    ("ks", "kscript"), ("k", "kwt"), ("ACE", "lace"), ("ace", "lace"),
    ("latte", "latte"), ("lte", "latte"), ("l", "lex"), ("lex", "lex"),
    ("lhs", "lhaskell"), ("ll", "lexpp"),
    ("cl", "lisp"), ("el", "lisp"), ("jl", "lisp"), ("lisp", "lisp"),
    ("lsp", "lisp"), ("lite", "lite"), ("lt", "lite"), ("lgt", "logtalk"),
    ("lot", "lotos"), ("lotos", "lotos"), ("lou", "lout"), ("lout", "lout"),
    ("sig", "lprolog"), ("lss", "lss"), ("lua", "lua"), ("mc", "m4"),
    ("eml", "mail"), ("dsp", "make"), ("mak", "make"), ("mk", "make"),
    ("man", "man"), ("mpl", "maple"), ("mv", "maple"), ("mws", "maple"),
    ("mason", "mason"), ("mhtml", "mason"), ("mel", "mel"), ("mf", "mf"),
    ("mgp", "mgp"), ("mib", "mib"), ("mms", "mmix"), ("moc", "moc"),
    ("DEF", "modula2"), ("MOD", "modula2"), ("m2", "modula2"),
    ("md", "modula2"), ("mi", "modula2"),
    ("i3", "modula3"), ("ig", "modula3"), ("m3", "modula3"),
    ("mg", "modula3"),
    ("isc", "monk"), ("monk", "monk"), ("ssc", "monk"), ("tsc", "monk"),
    ("moo", "moo"), ("mp", "mp"), ("msql", "msql"), ("mush", "mush"),
    ("mysql", "mysql"),
    # These used to be written with a leading dot (".NSA"), which the lookup
    # never includes in the key, so the rules could not match "file.NSA".
    ("NSA", "natural"), ("NSC", "natural"), ("NSG", "natural"),
    ("NSL", "natural"), ("NSM", "natural"), ("NSN", "natural"),
    ("NSP", "natural"), ("NSS", "natural"),
    ("ncf", "ncf"), ("nqc", "nqc"),
    ("OPL", "opl"), ("OPl", "opl"), ("Opl", "opl"),
    ("dpr", "pascal"), ("g", "pccts"), ("inc", "php"),
    ("ml", "ocaml"), ("mli", "ocaml"), ("mll", "ocaml"), ("mly", "ocaml"),
    ("mm", "nroff"), ("nr", "nroff"), ("nsi", "nsis"),
    ("o", "object"), ("obj", "object"), ("opl", "opl"),
    ("or", "openroad"), ("ora", "ora"), ("papp", "papp"),
    ("pas", "pascal"), ("php", "php"), ("pl", "perl"),
    ("pxml", "papp"), ("pxsl", "papp"), ("roff", "nroff"),
    ("sho", "dllobject"), ("sob", "dllobject"), ("tr", "nroff"),
    ("xin", "omnimark"), ("xom", "omnimark"),
    ("php3", "php"), ("phtml", "phtml"),
    ("lpc", "pike"), ("pike", "pike"), ("pmod", "pike"), ("ulpc", "pike"),
    ("rcp", "pilrc"), ("p36", "plm"), ("pac", "plm"), ("plm", "plm"),
    ("plp", "plp"), ("pls", "plsql"), ("plsql", "plsql"),
    ("po", "po"), ("pod", "pod"), ("eps", "postscript"),
    ("ps", "postscript"), ("pov", "pov"), ("ppd", "ppd"),
    ("ih", "ppwiz"), ("it", "ppwiz"), ("pdb", "prolog"), ("psf", "psf"),
    ("py", "python"), ("mat", "radiance"), ("rad", "radiance"),
    ("rc", "rc"), ("rex", "rexx"), ("rexx", "rexx"), ("x", "rpcgen"),
    ("rpl", "rpl"), ("rtf", "rtf"),
    # "rbw" was listed twice and plain "rb" not at all.
    ("rb", "ruby"), ("rbw", "ruby"),
    ("sas", "sas"), ("sa", "sather"), ("scm", "scheme"), ("sci", "scilab"),
    ("pdl", "sdl"), ("pr", "sdl"), ("sed", "sed"),
    ("sgm", "sgml"), ("sgml", "sgml"),
    ("bash", "sh"), ("ebuild", "sh"), ("env", "sh"), ("ksh", "sh"),
    ("sh", "sh"),
    ("sim", "simula"), ("s85", "sinda"), ("sin", "sinda"), ("il", "skill"),
    ("sl", "slang"), ("score", "slrnsc"), ("tpl", "smarty"),
    ("smith", "smith"), ("smt", "smith"), ("sml", "sml"),
    ("sno", "snobol4"), ("spec", "spec"), ("sp", "spice"),
    ("spice", "spice"),
    ("spd", "spup"), ("spdata", "spup"), ("speedup", "spup"),
    ("pkb", "sql"), ("pks", "sql"), ("sql", "sql"), ("tyb", "sql"),
    ("tyc", "sql"), ("typ", "sql"), ("sqlj", "sqlj"),
    ("sqi", "sqr"), ("sqr", "sqr"),
    ("s19", "srec"), ("s28", "srec"), ("s37", "srec"),
    ("cls", "st"), ("st", "st"), ("stp", "stp"), ("tak", "tak"),
    ("itcl", "tcl"), ("itk", "tcl"), ("tar", "tar"),
    ("tar.bz2", "tarbz2"), ("tar.gz", "targz"), ("tgz", "targz"),
    ("tcl", "tcl"), ("tk", "tcl"), ("ti", "terminfo"),
    ("dtx", "tex"), ("latex", "tex"), ("ltx", "tex"), ("sty", "tex"),
    ("tex", "tex"),
    ("texi", "texinfo"), ("texinfo", "texinfo"), ("txi", "texinfo"),
    ("tf", "tf"), ("t.html", "tilde"), ("tli", "tli"), ("slt", "tsalt"),
    ("tsscl", "tsscl"), ("tssgm", "tssgm"), ("tssop", "tssop"),
    ("uc", "uc"), ("ui", "ui"), ("uil", "uil"), ("uit", "uil"),
    ("ctl", "vb"), ("dsm", "vb"), ("sba", "vb"), ("vbs", "vb"),
    ("v", "verilog"),
    ("hdl", "vhdl"), ("vbe", "vhdl"), ("vhd", "vhdl"), ("vhdl", "vhdl"),
    ("vst", "vhdl"), ("vim", "vim"),
    ("hw", "virata"), ("module", "virata"), ("pkg", "virata"),
    ("wrl", "vrml"), ("wm", "webmacro"), ("wbt", "winbatch"),
    ("wml", "wml"), ("doc", "word"), ("wsc", "wsh"), ("wsf", "wsh"),
    ("ad", "xdefaults"), ("msc", "xmath"), ("msf", "xmath"),
    ("xpm2", "xpm2"), ("xs", "xs"), ("xsd", "xsd"), ("xsl", "xslt"),
    ("y", "yacc"), ("yy", "yaccpp"), ("zip", "zip"), ("z8a", "z8a"),
    ]

#
# The default list of detected file types by regexp.
# The order matters here!  Last item is checked first.
# _def_regexp_list = [ ("[cC]hange[lL]og", "changelog", 1), ("/var/named/", "bindzone", 0), ("crontab", "crontab", 1), (".*\\drac\\.", "dracula", 0), (".*fvwmrc", "fvwm", 0), (".*fvwm95", "fvwm", 0), (".*fvwm2rc", "fvwm", 0), ("\\.gtkrc", "gtkrc", 1), ("gtkrc", "gtkrc", 1), ("Prl.*\\.", "jam", 1), ("JAM.*\\.", "jam", 1), ("[mM]akefile", "make", 1), ("muttrc", "muttrc", 1), ("tmac\\.", "nroff", 1), (".*printcap", "printcap", 0), (".*termcap", "termcap", 0), (".*vimrc", "vim", 0), ("Xresources", "xdefaults", 1), (".*/app-defaults/", "xdefaults", 0), (".*/Xresources/", "xdefaults", 0), ("XF86Config", "xf86conf", 1), (".*xmodmap", "xmodmap", 0), ("zsh", "zsh", 1), ("zlog", "zsh", 1), ("xdm-config$", "xdefaults", 1), ("\\.Xresources$", "xdefaults", 1), ("\\.Xpdefaults$", "xdefaults", 1), ("\\.Xdefaults$", "xdefaults", 1), ("XF86Config$", "xf86conf", 1), ("cvs\\d+$", "cvs", 1), ("wvdial\\.conf$", "wvdial", 1), ("\\wgetrc$", "wget", 1), ("\\.wgetrc$", "wget", 1), ("vgrindefs$", "vgrindefs", 1), ("\\.viminfo", "viminfo", 1), ("\\_viminfo", "viminfo", 1), (".*\\.vhdl_[0-9]*$", "vhdl", 0), ("\\tidyrc$", "tidy", 1), ("\\.tidyrc$", "tidy", 1), ("texmf\\.cnf$", "texmf", 1), ("tags$", "tags", 1), ("squid\\.conf$", "squid", 1), ("vision\\.conf$", "hog", 1), ("snort\\.conf$", "hog", 1), ("\\.lrnrc", "slrnrc", 1), ("screenrc$", "screen", 1), ("\\.screenrc$", "screen", 1), ("\\.zcompdump", "zsh", 1), ("\\.zfbfmarks$", "zsh", 1), ("\\.zprofile$", "zsh", 1), ("\\.zlog", "zsh", 1), ("\\.zsh", "zsh", 1), ("csh\\.logout$", "csh", 1), ("csh\\.login$", "csh", 1), ("csh\\.cshrc$", "csh", 1), ("\\.alias", "csh", 1), ("\\.tcshrc", "csh", 1), ("\\.cshrc", "csh", 1), ("\\.login", "csh", 1), ("\\.profile", "sh", 1), ("/etc/profile", "sh", 0), ("\\.kshrc", "sh", 1), ("\\.bashrc", "sh", 1), ("bashrc", "sh", 1), ("bash\\.bashrc", "sh", 1), ("\\.bash_profile", "sh", 1), ("\\.bash_logout", "sh", 1), ("sgml\\.catalog", "catalog", 1), ("catalog$", "catalog", 1), ("sendmail\\.cf", "sendmail", 1), 
("smb\\.conf", "samba", 1), ("robots.txt", "robots", 1), ("\\.reminders", "remind", 1), ("\\.inputrc$", "readline", 1), ("\\.ratpoisonrc$", "ratpoison", 1), ("\\ratpoisonrc$", "ratpoison", 1), ("\\.procmail$", "procmail", 1), ("\\.procmailrc$", "procmail", 1), (".*printcap$", "printcap", 0), (".*termcap$", "termcap", 0), ("\\.povrayrc$", "povini", 1), ("main.cf$", "pfmain", 1), ("\\.pinerc$", "pine", 1), ("\\pinerc$", "pine", 1), ("\\.muttrc", "muttrc", 1), ("\\.mutt/muttrc", "muttrc", 1), ("Muttrc$", "muttrc", 1), ("[mM]akefile$", "make", 1), ("GNUmakefile$", "make", 1), ("snd.\\d+$", "mail", 1), ("\\.letter$", "mail", 1), ("\\.letter\\.\\d+$", "mail", 1), ("\\.followup$", "mail", 1), ("\\.article$", "mail", 1), ("\\.article\\.\\d+$", "mail", 1), ("\\pico\\.\\d+$", "mail", 1), ("\\mutt-.*-\\d+$", "mail", 1), ("\\mutt\\w{6}$", "mail", 1), ("\\ae\\d+\\.txt$", "mail", 1), ("/tmp/SLRN[0-9A-Z.]+$", "mail", 0), ("\\.emacs$", "lisp", 1), ("\\.sawfishrc$", "lisp", 1), ("lilo.conf", "lilo", 1), ("lftp.conf$", "lftp", 1), ("\\.lftprc$", "lftp", 1), (".*lftp/rc$", "lftp", 0), (".*properties_..$", "jproperties", 0), (".*properties_.._..$", "jproperties", 0), (".*properties_.._.._.*$", "jproperties", 0), ("inittab$", "inittab", 1), ("\\.gtkrc$", "gtkrc", 1), ("gtkrc$", "gtkrc", 1), ("gkrellmrc_.$", "gkrellmrc", 1), ("gkrellmrc$", "gkrellmrc", 1), ("\\.gdbinit$", "gdb", 1), ("fstab$", "fstab", 1), ("auto.master$", "conf", 1), ("exports$", "exports", 1), ("filter-rules$", "elmfilt", 1), (".*lvs$", "dracula", 0), (".*lpe$", "dracula", 0), ("debian/control$", "debcontrol", 1), (".*\\.\\.ch$", "ch", 0), ("named\\.conf$", "named", 1), ("named\\.root$", "bindzone", 1), ("build\\.xml$", "ant", 1), (".*vimrc$", "vim", 0), (".*exrc$", "vim", 0), ("configure$", "sh", 1), ("configure.ac$", "config", 1), (".*COPYING$", "text", 0), (".*README$", "text", 0), (".*read.me$", "text", 0), ("proftpd\\.conf", "apachestyle", 1), ("httpd\\.conf", "apache", 1), ("srm\\.conf", "apache", 1), 
("access\\.conf", "apache", 1), ("apache\\.conf", "apache", 1), ("\\.htaccess$", "apache", 1), (".*enlightenment/.*\\.cfg$", "c", 0), (".*Eterm/.*\\.cfg$", "eterm", 0), ("lynx\\.cfg$", "lynx", 1), (".*baseq[2-3]/.*\\.cfg$", "quake", 0), (".*id1/.*\\.cfg$", "quake", 0), (".*quake[1-3]/.*\\.cfg$", "quake", 0), ("crontab$", "crontab", 1), ] # # The default list of detected file types by script name. # _def_script_list = [ (".*\\bpython", "python"), (".*\\bperl", "perl"), (".*\\bphp", "php"), (".*\\bruby", "ruby"), (".*\\bbc\\b", "bc"), (".*\\bsed\\b", "sed"), (".*\\bocaml", "ocaml"), (".*awk\\b", "awk"), (".*wml\\b", "wml"), (".*\\bksh\\b", "sh"), (".*\\bsh\\b", "sh"), (".*\\bbash", "sh"), (".*csh\\b", "csh"), (".*\\bzsh\\b", "zsh"), (".*\\btclsh\\b", "tcl"), (".*\\bwish\\b", "tcl"), (".*\\bexpectk\\b", "tcl"), (".*\\bitclsh\\b", "tcl"), (".*\\bitwish\\b", "tcl"), (".*\\bexpect\\b", "expect"), (".*\\bgnuplot\\b", "gnuplot"), (".*make\\b", "make"), ] # # The default list of detected file types with Python code. 
# _def_python_list = [ ("am", 0, """ # Use Python to avoid the .am suffix is recognized if string.lower(fname_base) == "makefile.am": type = "automake" """), ("bas,frm", 0, """ if string.lower(fname[-3:]) == "frm": type = "form" else: type = "basic" f = open(fname) l = '' try: for i in xrange(1,5): l = l + f.readline() except: pass f.close() if re.search("VB_Name|Begin VB\\\\.(Form|MDIForm|UserControl)", l, re.I): type = "vb" """), ("ch", 0, """ type = "ch" f = open(fname) try: for i in xrange(1,10): if f.readline()[0] == '@': type = "change" break except: pass f.close() """), ("e,E", 0, """ type = "eiffel" f = open(fname) try: for i in xrange(1,100): if cre_match("\\\\s*(<'|'>)\\\\s*$", f.readline()): type = "specman" break except: pass f.close() """), ("ent", 0, """ type = "dtd" f = open(fname) try: for i in xrange(1,6): l = f.readline() if cre_match("\\\\s*[#{]", l): type = "cl" break if not cre_match("\\\\s*$", l): break except: pass f.close() """), ("rul", 0, """ type = "diva" f = open(fname) try: for i in xrange(1,6): if string.find("InstallShield", f.readline()): type = "ishd" break except: pass f.close() """), ("com", 0, """ type = "dcl" f = open(fname) try: l1 = f.readline() + f.readline() l2 = f.readline() + f.readline() if (cre_search("\\\\$ORIGIN|\\\\$TTL|IN\\\\s*SOA", l1) or cre_search("BIND.*named", l1 + l2)): type = "dns" except: pass f.close() """), ("in", 0, """ # Use Python to avoid the .in suffix is recognized if fname_base == "configure.in": type = "config" """), ("m", 0, """ type = "matlab" f = open(fname) try: for i in xrange(1,10): l = f.readline() if cre_match("\\\\s*#(include|import)", l): type = "objc" break if cre_match("\\\\s*%", l): break if cre_match("\\\\s*\\\\(\\\\*", l): type = "mma" break except: pass f.close() """), ("mod", 0, """ type = "modsim3" f = open(fname) try: if cre_search("\\\\bmodule\\\\b", f.readline()): type = "lprolog" except: pass f.close() """), ("1,2,3,4,5,6,7,8,9,t,ms", 0, """ f = open(fname) found = 0 try: for i 
in xrange(1,5): l = f.readline() if not l: break if l[0] == '.': type = "nroff" found = 1 break except: pass f.close() if not found: if fname[-1] == 't': type = "tads" elif fname[-1] == 's': type = "xmath" """), ("pl", 0, """ type = "perl" f = open(fname) try: while 1: l = f.readline() if l: break except: pass f.close() if (cre_search("\\\\bprolog\\\\b|:-", l) or cre_match("\\\\s*(%+(\\\\s|$)|/\\\\*)", l)): type = "prolog" """), ("pm", 0, """ type = "perl" f = open(fname) try: l = f.readline() except: pass f.close() if cre_search("XPM2", l): type = "xpm2" elif cre_search("XPM", l): type = "xpm" """), ("inc", 0, """ type = "php" f = open(fname) l = '' try: for i in xrange(1,3): l = l + f.readline() except: pass f.close() if cre_search("perlscript", l): type = "aspperl" elif cre_search("<%", l): type = "aspvbs" elif cre_search("", f.readline()): type = "xml" except: pass f.close() """), ("smi", 0, """ type = "mib" f = open(fname) try: if cre_search("\\\\bsmil\\\\b", f.readline()): type = "smil" except: pass f.close() """), ("web", 0, """ type = "winbatch" f = open(fname) try: for i in xrange(0,5): if f.readline()[0] == '%': type = "web" break except: pass f.close() """), ("xpm", 0, """ type = "xpm" f = open(fname) try: if cre_search("XPM2", f.readline()): type = "xpm2" except: pass f.close() """), ("xml", 0, """ type = "xml" """), ("", 0, """ while 1: if fname == "INDEX" or fname == "INFO": f = open(fname) try: if cre_match("\\\\s*(distribution|installed_software|root|bundle|product)\\\\s*$", f.readline()): type = "psf" f.close() break except: pass f.close() if string.find("jarg", fname): f = open(fname) try: for i in xrange(0,5): if re.search("THIS IS THE JARGON FILE", f.readline(), re.I): type = "jargon" break except: pass f.close() break """), ("", 1, """ if ignore and fname[-1] == '~': type = ft_detect(fname[:-1], 1) """), ("", 1, """ f = open(fname) line1 = f.readline() lines = ['', line1, '', '', '', ''] for i in xrange(2, 6): try: lines[i] = f.readline() 
except: break if line1 and line1[0] == ':' and line1[1] == '\\\\n': type = "sh" elif cre_match("#(compdef|autoload)\\\\b", line1): type = "zsh" elif cre_match("From [a-zA-Z][a-zA-Z_0-9\\\\.=-]*(@[^ ]*)? .*[12][09]\\\\d\\\\d$", line1): type = "mail" elif cre_match("<[%&].*>", line1): type = "mason" elif cre_match('" *[vV]im$', line1): type = "vim" elif cre_match("\\\\*\\\\* LambdaMOO Database, Format Version", line1): type = "moo" elif (cre_match("diff\\\\b|Only in |\\\\d+(,\\\\d+)?[cda]\\\\d+\\\\b|# It was generated by makepatch |Index:\\\\s+\\\\S+$|==== //\\\\S+#\\\\d+", line1) or (cre_match("--- ", line1) and cre_match("+++ ", lines[2])) or (cre_match("\\\\*\\\\*\\\\* ", line1) and cre_match("--- ", lines[2]))): type = "diff" elif cre_match("%!\\\\s*PS", line1): type = "postscript" elif (cre_match("\\\\s*dnl\\\\b", line1) or cre_match("\\\\s*dnl\\\\b", lines[2]) or cre_match("\\\\s*dnl\\\\b", lines[3]) or cre_match("\\\\s*dnl\\\\b", lines[4]) or cre_match("\\\\s*dnl\\\\b", lines[5])): type = "m4" elif re.match(" *proc[nd] *$", line1, re.I): type = "sicad" elif cre_match("\\\\*\\\\*\\\\* Purify", line1): type = "purifylog" elif cre_search("<\\\\?\\\\s*xml.*\\\\?>", line1): type = "xml" elif cre_match("[0-9a-fA-F]{7}: [0-9a-fA-F]{2} [0-9a-fA-F]{2} [0-9a-fA-F]{2} [0-9a-fA-F]{2} ", line1): type = "xxd" elif cre_match("RCS file:", line1) or cre_match("RCS file:", lines[2]): type = "rcslog" elif cre_match("CVS:", lines[2]): type = "cvs" elif cre_match("SEND-PR:", line1): type = "sendpr" elif cre_match("SNNS network definition file", line1): type = "snnsnet" elif cre_match("SNNS pattern definition file", line1): type = "snnspat" elif cre_match("SNNS result file", line1): type = "snnsres" elif (cre_match("%.*?[Vv]irata", line1) or cre_match("%.*?[Vv]irata", lines[2]) or cre_match("%.*?[Vv]irata", lines[3]) or cre_match("%.*?[Vv]irata", lines[4]) or cre_match("%.*?[Vv]irata", lines[5])): type = "virata" elif cre_match("[0-9]* *execve\\\\(", line1): type = "strace" elif 
(cre_search("K & K Associates", lines[4]) or cre_search("TAK 2000", lines[2])): type = "takout" elif cre_search("S Y S T E M S I M P R O V E D ", lines[3]): type = "sindaout" # takcmp and sindacmp skipped elif (cre_search("\\\\$ORIGIN|\\\\$TTL|IN\\\\s*SOA", line1 + lines[2]) or cre_search("BIND.*named", line1 + lines[2] + lines[3] + lines[4])): type = "dns" elif ((cre_search("\\\\|\\\\*{1,80}", line1) and cre_search("VRC ", lines[2])) or (cre_search("\\\\|\\\\*{1,80}", lines[2]) and cre_search("VRC ", lines[3]))): type = "baan" elif cre_match("==\\\\d+== valgrind", line1): type = "valgrind" else: line = None for i in xrange(1,6): if not cre_match("\\\\? ", lines[i]): line = lines[i] break if not line: while 1: try: l = f.readline() if not cre_match("\\\\? ", l): line = l break except: break if line and cre_match("Index:\\\\s+\\\\S+$", line): type = "diff" f.close() """), ("mas,master", 1, """ type = "master" """), ("m4", 1, """ type = "m4" """), ("me", 1, """ type = "nroff" """), ("txt", 1, """ type = "text" """), ("inp", 1, """ f = open(fname) try: l = f.readline() if l[0] == '*': type = "abaqus" else: for i in xrange(1, 500): if len(l) >= 19 and string.lower(l[:19]) == "header surface data": type = "trasys" break l = f.readline() except: pass f.close() """), ("asp", 1, """ type = "aspvbs" f = open(fname) try: l = f.readline() l = l + f.readline() l = l + f.readline() except: pass if string.find("perlscript", string.lower(l)) >= 0: type = "aspperl" f.close() """), ("cfg", 1, """ type = "cfg" """), ] # # The extra list of detected file types for case sensitive systems. # if os.name == "posix": _case_detect_list = """ suffix L lisp suffix C cpp suffix H cpp """ # List of _Ft_py objects: Python code executed to detect file type. # Used first. _py_list_before = [] # Dictionary used to map file name extension to file type. _suffix_dict = {} # List of _Ft_re objects; a match of the RE with the file name defines the file # type. 
_regexp_list = [] # List of _Ft_re objects: a match of the RE with the script in the first line # of the file defines the file type. _script_list = [] # List of _Ft_py objects: Python code executed to detect file type. # Used after everything else didn't detect the type. _py_list_after = [] # The detected file types are cached. This assumes the file type doesn't # change while executing recipes. Would this every be false? # Index in the list is "ignore". _cache_dict = [{}, {}] # Dictionary of known filetypes (only the keys are important) _filetype_dict = {} # List of types from the builtin python scripts. # Generated with the following shell command: # # grep 'type[[:space:]]*=[[:space:]]*"' Filetype.py | \ # sed -e 's,""",,' | \ # sed -e 's,[^"]*",,' -e 's,".*,,' | \ # sort | uniq \ # sed -e 's,^, ",' -e 's/$/",/' # # This is used to pre-populate _filetype_dict. Update this list if the list of # builtin Python detected-types changes. _filetype_pre_list = [ "abaqus", "asm", "aspperl", "aspvbs", "automake", "baan", "basic", "cfg", "ch", "change", "cl", "config", "cvs", "cweb", "dcl", "diff", "diva", "dns", "dtd", "eiffel", "form", "ishd", "jargon", "lprolog", "m4", "mail", "mason", "master", "matlab", "mib", "mma", "modsim3", "moo", "nroff", "objc", "pascal", "perl", "php", "postscript", "progress", "prolog", "psf", "purifylog", "rcslog", "rebol", "registry", "rexx", "sendpr", "sgmldecl", "sh", "sicad", "sindaout", "smil", "snnsnet", "snnspat", "snnsres", "specman", "strace", "tads", "takout", "text", "trasys", "valgrind", "vb", "vim", "virata", "web", "winbatch", "xmath", "xml", "xpm", "xpm2", "xxd", "zsh", # End of the list of grepped types. # The remainder of these types is internal to AAP and cannot be detected. 
"libobject", "ltobject" ] _did_init = 0 # non-zero when __init__() did its work def __init__(): global _suffix_dict, _regexp_list, _script_list global _py_list_before, _py_list_after global _did_init global _filetype_dict # this only needs to be done once if _did_init: return _did_init = 1 _py_list_before = [] _suffix_dict = {} _regexp_list = [] _script_list = [] _py_list_after = [] _filetype_dict = {} # Load the built-in detection rules. _add_suffixlist(_def_suffix_list) _add_regexplist(_def_regexp_list) _add_scriptlist(_def_script_list) _add_pythonlist(_def_python_list) if os.name == "posix": ft_add_rules(_case_detect_list, 1) # Load detection rules from system and user *.afd files. for dirpath in default_dirs({}): ft_check_dir(os.path.join(dirpath, "afd")) # Declare all the filetypes known from the builtin Python bits for i in _filetype_pre_list: _filetype_dict[i] = 1 class DetectError(Exception): """Error for something gone wrong.""" def __init__(self, args = None): Exception.__init__(self) self.args = args def ft_known(type): """Return True when "type" is a known filetype.""" __init__() return _filetype_dict.has_key(type) def ft_declare(type): """Delcare "type" to be a known filetype.""" __init__() _filetype_dict[type] = 1 def ft_check_dir(dir, errmsg = 0, recdict = None): """Check directory "dir" for *.afd files and load them. 
When "errmsg" is non-zero give an error message when the directory doesn't exist.""" if os.path.exists(dir) and os.path.isdir(dir): for f in glob.glob(os.path.join(dir, "*.afd")): try: ft_read_file(f, recdict) except DetectError, e: if _run_as_program: print str(e) else: from Message import msg_error msg_error(recdict, str(e)) elif errmsg: e = _('Directory does not exist: "%s"') % dir if _run_as_program: print e else: from Message import msg_error msg_error(recdict, e) def ft_read_file(fname, recdict = None): """Read file "fname" for file type detection rules.""" try: fd = open(fname) except IOError, e: raise DetectError, (_('Cannot open "%s": ') % fname) + str(e) try: s = fd.read() except IOError, e: raise DetectError, (_('Cannot read "%s": ') % fname) + str(e) fd.close() ft_add_rules(s, 1, recdict) def ft_add_rules(dtstr, recipe_line_nr, recdict = None): """Add file type detection rules from string "dtstr". "recipe_line_nr" is the first line number in a recipe, zero when not reading a recipe.""" # Always load the default rules first (skipped when done already). __init__() # Split the string into individual lines. lines = string.split(dtstr, '\n') # Loop over all the lines (may use more than one for python items). # Note: using skip_white() and skip_to_white() is avoided here for speed. line_idx = 0 line_count = len(lines) while line_idx < line_count: line = lines[line_idx] # isolate first word: type of detection. 
items = string.split(line, None, 1) # ignore empty and comment lines if len(items) < 1 or items[0][0] == '#': line_idx = line_idx + 1 continue itype = items[0] if len(items) < 2: rline = '' else: rline = items[1] rline_len = len(rline) # isolate first argument, which may be in quotes as = 0 if as < rline_len: if rline[as] == '"' or rline[as] == "'": quote = rline[as] as = as + 1 ae = as while ae < rline_len and rline[ae] != quote: ae = ae + 1 if ae == rline_len: raise DetectError, (_('Missing quote in line %d: "%s"') % (line_idx + recipe_line_nr, line)) n = ae + 1 else: ae = as while ae < rline_len and rline[ae] != ' ' and rline[ae] != '\t': ae = ae + 1 n = ae arg1 = rline[as:ae] else: arg1 = '' n = rline_len # Isolate further arguments (no quotes!). # A superfluous argument is silently ignored (could be a comment). args = string.split(rline[n:]) if len(args) >= 1: arg2 = args[0] else: arg2 = '' if len(args) >= 2: arg3 = args[1] else: arg3 = '' if len(args) >= 3: arg4 = args[2] else: arg4 = '' if ((itype in ["suffix", "regexp", "script"] and not arg2) or (itype == "declare" and not arg1)): raise DetectError, (_('Missing argument in line %d: "%s"') % (line_idx + recipe_line_nr, line)) # Just declare a filetype if itype == "declare": _filetype_dict[arg1] = 1 # Filetype file file suffix elif itype == "suffix": _add_suffix(arg1, arg2) # Filetype based on a regex match of the filename elif itype == "regexp": _add_regexp(arg1, arg2, arg3 == "tail" or arg4 == "tail", arg3 == "append" or arg4 == "append") # Filetype based on checking the #! 
line for an interpreter elif itype == "script": _add_script(arg1, arg2, arg3 and arg3 == "append") # Magic python-based filetype detection elif itype == "python": append = 0 after = 0 suffix = None for arg in [arg1, arg2, arg3]: if arg: if arg == "append": append = 1 elif arg == "after": after = 1 elif not suffix: suffix = arg else: raise DetectError, ( _('Illegal argument in line %d: "%s"') % (line_idx + recipe_line_nr, line)) start_indent = get_indent(line) line_idx = line_idx + 1 start_line_idx = line_idx + recipe_line_nr cmds = "" while line_idx < line_count: line = lines[line_idx] if get_indent(line) <= start_indent: # Ignore empty and comment lines. i = skip_white(line, 0) if i < len(line) and line[i] != '#': line_idx = line_idx - 1 # this line has next item break cmds = cmds + line + '\n' line_idx = line_idx + 1 if not cmds: raise DetectError, (_('Python commands missing in line %d') % (line_idx + recipe_line_nr)) _add_python(cmds, _("filetype detection; python code at line %d: ") % start_line_idx, after, append, suffix) else: raise DetectError, ( _('Illegal item "%s" in argument to ft_add_rules(): %s') % (itype, line)) line_idx = line_idx + 1 class _Ft_re: """Class used to store pairs of RE and file type.""" def __init__(self, regexp, type, tail): self.re = regexp self.type = type self.tail = tail # match tail of filename self.cre = None def comp(self): """Get the compiled regexp, cache the result.""" try: self.cre = re.compile(self.re) except StandardError, e: raise DetectError, (_('Error in filetype detection regexp "%s": ') % self.re) + str(e) class _Ft_py: """Class used to store Python code for detecting a file type.""" def __init__(self, code, suffix, error_msg): self.code = code # the Python code as a string self.ccode = None # the compiled Python code self.suffix = suffix # the list of required suffixes or None self.error_msg = error_msg # a message used for errors def compile(self): if not self.ccode: # DEBUG # print "compling for suffix: ", 
self.suffix # print "compiling code: ", self.code # Prepend "if 1:" to get the indenting right. if self.code[0] == ' ' or self.code[0] == '\t': tcode = "if 1:\n" + self.code else: tcode = self.code try: self.ccode = compile(tcode, 'filetype detection rules', 'exec') except StandardError, e: raise DetectError, (_('Error in Python code (%s): ') % self.error_msg) + str(e)
# NOTE(review): the line above is the tail of a definition that starts before
# this section; it is kept exactly as found.


def _add_suffix(suf, type):
    """Add detection of "type" by file name extension "suf".
       When "type" is "ignore" it means the suffix is removed and further
       detection done on the rest.
       When "type" is "remove" an existing detection for "suf" is removed."""
    if type == 'remove':
        # Removing a suffix that was never added is silently accepted.
        if suf in _suffix_dict:
            del _suffix_dict[suf]
    else:
        _suffix_dict[suf] = type
        _filetype_dict[type] = 1


def _add_suffixlist(list):
    """Add suffix rules from a list of suffix-type tuples."""
    for suf, itype in list:
        _suffix_dict[suf] = itype
        _filetype_dict[itype] = 1


def _add_regexp(regexp, type, tail, append):
    """Add detection of "type" by matching the file name with Python regular
       expression "regexp".
       When "tail" is non-zero the rule is matched against the tail (base
       name) of the file name instead of the whole name.
       When append is non-zero, add to the end of the regexp rules, otherwise
       insert at the start.
       When "type" is "remove" every existing detection for "regexp" is
       removed."""
    if type == 'remove':
        # Filter in place instead of calling remove() while looping over the
        # same list: that skips the item following each removed one.
        _regexp_list[:] = [r for r in _regexp_list if r.re != regexp]
    else:
        f = _Ft_re(regexp, type, tail)
        if append:
            _regexp_list.append(f)
        else:
            _regexp_list.insert(0, f)
        _filetype_dict[type] = 1


def _add_regexplist(list):
    """Add regexp rules from a list of regexp-type-tail tuples."""
    for regexp, itype, tail in list:
        _add_regexp(regexp, itype, tail, 0)


def _add_script(regexp, type, append):
    """Add detection of "type" by matching the script name in the first line
       of the file with Python regular expression "regexp".
       When append is non-zero, add to the end of the script rules, otherwise
       insert at the start.
       When "type" is "remove" every existing detection for "regexp" is
       removed."""
    if type == 'remove':
        # Filter in place; see _add_regexp() for why remove() is not used.
        _script_list[:] = [r for r in _script_list if r.re != regexp]
    else:
        f = _Ft_re(regexp, type, 0)
        _filetype_dict[type] = 1
        if append:
            _script_list.append(f)
        else:
            _script_list.insert(0, f)


def _add_scriptlist(list):
    """Add script rules from a list of scriptname-type tuples."""
    for regexp, itype in list:
        _add_script(regexp, itype, 0)


def _add_python(code, error_msg, after, append, suffix):
    """Add detection of "type" by using Python code "code".  Each line in
       "code" must end in a '\n'.
       "error_msg" is printed when executing the code results in an error.
       When "after" is non-zero use this rule after suffix, regexp and script
       rules.
       When append is non-zero, add to the end of the python rules, otherwise
       insert at the start.
       "suffix" is a comma separated list of suffixes; when empty the rule is
       used for every file."""
    if suffix:
        suffixes = suffix.split(',')
    else:
        suffixes = []
    rule = _Ft_py(code, suffixes, error_msg)

    if after:
        rules = _py_list_after
    else:
        rules = _py_list_before

    if append:
        rules.append(rule)
    else:
        rules.insert(0, rule)


def _add_pythonlist(list):
    """Add python rules from a list of suffix-after-script tuples."""
    msg = _("default rule")
    for suffix, after, script in list:
        _add_python(script, msg, after, 0, suffix)


def _exec_py(fname, item, ignore):
    """Execute the code defined with _add_python().
       Returns the value the code assigned to "type", or None when it did not
       assign one.
       An IOError raised by the code is ignored, a StandardError is turned
       into a DetectError."""
    # The shared exec_recdict is reused for every call; only the per-call
    # entries are reset here.
    exec_recdict["fname"] = fname
    exec_recdict["fname_base"] = os.path.basename(fname)
    exec_recdict["ft_detect"] = ft_detect
    exec_recdict["ignore"] = ignore
    # Drop the result of a previous rule, it must not leak into this one.
    if "type" in exec_recdict:
        del exec_recdict["type"]

    item.compile()
    try:
        # The tuple form of exec is accepted by Python 2 and Python 3.
        exec(item.ccode, exec_recdict, exec_recdict)
    except IOError:
        pass            # ignore errors for reading the file
    except StandardError as e:
        raise DetectError(_(item.error_msg) + str(e))

    return exec_recdict.get("type")


def ft_detect(fname, ignore = 0, recdict = None):
    """Detect the file type for file "fname".
       The checks are done in this order: directory, early Python rules,
       suffixes, file name regexps, script name in the first line, late Python
       rules.  The first one that gives a result wins.
       When "ignore" is non-zero a suffix of type "ignore" is stripped and
       detection is done on the remaining name.
       "recdict" is only passed on to the recursive call.
       Returns the type as a string or None.  The result is cached.
       May raise DetectError when a Python rule fails."""
    # return quickly when already detected before
    # NOTE(review): the lookup uses the name as given, results are stored
    # under the fname_fold() name; these differ when folding changes the name.
    if fname in _cache_dict[ignore]:
        return _cache_dict[ignore][fname]

    if os.path.isdir(fname):
        _cache_dict[ignore][fname] = "directory"
        return "directory"

    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    # Initialize (will skip when done already)
    __init__()

    # On non-Posix systems we ignore case differences by making the name lower
    # case.
    fname = fname_fold(fname)
    bn = os.path.basename(fname)

    # The suffix used to select Python rules is what follows the last dot of
    # the base name.  Taking it from the full path would pick up a dot in a
    # directory name.
    i = bn.rfind(".")
    if i > 0:
        suffix = bn[i + 1:]
    else:
        suffix = ''

    # Do the early python code checks.  May first check if the suffix matches.
    for p in _py_list_before:
        if not p.suffix or suffix in p.suffix:
            atype = _exec_py(fname, p, ignore)
            if atype:
                _cache_dict[ignore][fname] = atype
                return atype

    # Try the extension, this is fastest.
    # When "fname" has several extensions, try with all of them first, then
    # try by removing the first ones:  "f.html.c": "html.c" then ".c".
    i = bn.find(".")
    while i > 0 and i + 1 < len(bn):
        # Found a dot that's not the first or last character.
        ext = bn[i + 1:]
        if ext in _suffix_dict:
            ft = _suffix_dict[ext]
            if ft == "ignore" and ignore:
                # remove an ignored extension and detect with that
                ft = ft_detect(fname[:-len(bn[i:])], 1, recdict)
            _cache_dict[ignore][fname] = ft
            return ft
        i = bn.find(".", i + 1)

    # match all defined REs with the file name.
    # TODO: handle "/" in RE and fname.
    for r in _regexp_list:
        if not r.cre:
            r.comp()
        # A "tail" rule only looks at the base name.
        if r.tail:
            name = bn
        else:
            name = fname
        if r.cre.match(name):
            _cache_dict[ignore][fname] = r.type
            return r.type

    # match all defined REs with the script name in the first line of the
    # file.
    line = None
    try:
        f = open(fname)
        try:
            line = f.readline()
        finally:
            # Also close the file when reading the line fails.
            f.close()
    except (IOError, OSError):
        # Errors for files that can't be read are ignored.
        pass

    if line is not None and len(line) > 2 and line[:2] == "#!":
        # TODO: remove "env VAR=val" and script arguments from line
        text = line[2:]
        for r in _script_list:
            if not r.cre:
                r.comp()
            if r.cre.match(text):
                _cache_dict[ignore][fname] = r.type
                return r.type

    # Do the late python code checks.  May first check if the suffix matches.
    for p in _py_list_after:
        if not p.suffix or suffix in p.suffix:
            atype = _exec_py(fname, p, ignore)
            if atype:
                _cache_dict[ignore][fname] = atype
                return atype

    _cache_dict[ignore][fname] = None
    return None


def filetype_root(ft):
    """When "ft" contains an underscore, return the part before the underscore.
       This is the basic filetype for user-defined filetypes.
       Return None otherwise, also when "ft" starts with the underscore."""
    i = ft.find('_')
    if i > 0:
        return ft[:i]
    return None


# When executed as a program, detect the type of the specified file.
if __name__ == '__main__':
    # Internationalisation inits: setlocale and gettext.
    i18n_init()

    items = []          # {"dir": name} and {"file": name} in argument order
    checkfile = None    # the file to detect the type of
    _run_as_program = 1

    # Check for any "-Idir", "-I dir", "-ffile" and "-f file" arguments.
    next_is_dir = 0
    next_is_file = 0
    for arg in sys.argv[1:]:
        if next_is_dir:
            # append(), not extend(): extend() would add the dict's keys.
            items.append({"dir" : arg})
            next_is_dir = 0
        elif next_is_file:
            items.append({"file" : arg})
            next_is_file = 0
        elif arg.startswith("-I"):
            if len(arg) > 2:
                items.append({"dir" : arg[2:]})
            else:
                next_is_dir = 1
        elif arg.startswith("-f"):
            if len(arg) > 2:
                items.append({"file" : arg[2:]})
            else:
                next_is_file = 1
        else:
            if checkfile:
                print(_("Can only check one file"))
                sys.exit(1)
            checkfile = arg

    if next_is_dir:
        print(_("-I argument must be followed by a directory name"))
        sys.exit(1)
    if next_is_file:
        print(_("-f argument must be followed by a file name"))
        sys.exit(1)
    if not checkfile:
        print(_("Usage: %s [-I ruledir] [-f rulefile] filename") % sys.argv[0])
        sys.exit(1)

    # load the built-in default rules
    __init__()

    # Check specified directories for *.afd files and read specified files.
    for item in items:
        if "dir" in item:
            ft_check_dir(item["dir"])
        else:
            try:
                ft_read_file(item["file"])
            except DetectError as e:
                print(e)

    try:
        # Use the file name found above; sys.argv[1] may be an option.
        detected = ft_detect(checkfile)
        if detected == "ignore":
            print("%s (ignored suffix)" % ft_detect(checkfile, 1))
        else:
            print(detected)
    except DetectError as e:
        sys.stderr.write("Detection error: " + str(e) + "\n")

# vim: set sw=4 et sts=4 tw=79 fo+=l: