#!/usr/bin/crm
#
#   Train an SVM ham/spam classifier from a TREC-style index file, then
#   classify the following entries of the same index and report accuracy.
#
#   Flow, as implemented below:
#     1. skip the first :S: lines of the index,
#     2. learn the next :N: files into :ham_svm: or :spam_svm: by label,
#     3. run one combined learn over the three SVM files,
#     4. classify the next :M: files and count the correct verdicts.
#
window
isolate (:index_file:) /index/     # files to run, in TREC index format
                                   # that is, with one entry per line like:
                                   #    "ham"|"spam" relative/file/name
isolate (:P:) //                   # if set with --P, do by paragraphs
isolate (:L:) /100000/             # run microgroomed learn modulo this cycle
isolate (:S:) /0/                  # number of msgs to skip off start of index
isolate (:N:) /1000/               # number of msgs to learn
isolate (:M:) /1000/               # number of msgs to classify after learning
isolate (:ham_svm:) /ham.svm/
isolate (:spam_svm:) /spam.svm/
isolate (:ham_vs_spam:) /ham_vs_spam.svmhyp/
isolate (:total: :lino: :messago: :filo: :label: :stat:)
isolate (:nham:) /0/
isolate (:nspam:) /0/
isolate (:correct:) /0/
#output /N = ':*:N:'\n/

#   Remember the requested classify count now, because :M: is counted
#   down to zero by the classify loop and is needed for the accuracy.
eval (:total:) /:*:M: /

input (:whole_index_file:) [:*:index_file:]

#   Skip first :S: files.
#   The counter is tested before a line is consumed, so S = 0 skips nothing.
#   Every walk over :whole_index_file: uses fromend so that each match
#   continues after the previous one instead of re-reading the first line.
output /Skipping first :*:S: files.\n/
{
    eval /:@: :*:S: > 0 :/
    match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
    eval (:S:) /:@: :*:S: - 1 :/
    liaf
}

#   Learning phase - one index line per pass, :N: passes.
output /Learning the next :*:N: files.\n/
{
    eval /:@: :*:N: > 0 :/
    #output /N = ':*:N:'\n/
    {
        #   Take the next index line and split it into label and file name.
        match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
        #output /:*:lino:/
        match [:lino:] /([[:graph:]]+)[[:space:]]+([[:graph:]]+)/ (:: :label: :filo:)
        #   File spec is name, offset 0, length 10000.
        input (:messago:) [:*:filo: 0 10000]
        #output /learning file :*:filo:\n/
        {
            match /ham/ [:label:]
            #output /This is a ham!\n/
            eval (:nham:) /:@: :*:nham: + 1:/
            {
                #   Paragraph mode - learn each chunk in turn. fromend
                #   moves the match forward on every pass of the liaf loop.
                #   NOTE(review): when no further chunk matches, this block
                #   fails and the whole-message learn after alius also runs.
                #   Confirm that this is intended.
                match [:P:] /SET/
                match <fromend> [:messago:] (:one_para:) \
                    /([[:graph:]]+[[:space:]]+){7}.*?\n\n/
                learn (:*:ham_svm:) < svm unigram unique > \
                    [:one_para:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
                liaf
            }
            alius
            learn (:*:ham_svm:) < svm unigram unique > \
                [:messago:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
        }
        alius
        {
            match /spam/ [:label:]
            #output /This is a spam!\n/
            eval (:nspam:) /:@: :*:nspam: + 1:/
            {
                #   Paragraph mode, same scheme as the ham branch above.
                match [:P:] /SET/
                match <fromend> [:messago:] (:one_para:) \
                    /([[:graph:]]+[[:space:]]+){7}.*?\n\n/
                learn (:*:spam_svm:) < svm unigram unique > \
                    [:one_para:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
                liaf
            }
            alius
            learn (:*:spam_svm:) < svm unigram unique > \
                [:messago:] /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
        }
    }
    eval (:N:) /:@: :*:N: - 1:/
    {
        #   Whenever the remaining count is a multiple of :L:, run a
        #   microgroomed combined learn and print a progress mark.
        eval /:@: :*:N: % :*:L: = 0 :/
        learn (:*:ham_svm: |:*:spam_svm:|:*:ham_vs_spam:) \
            < svm unigram unique microgroom > \
            /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1.0/
        output /*/
    }
    liaf
}
output /total number of training spam is :*:nspam:\ntotal number of training ham is :*:nham:\n/

#   Combined learn over both class files and the hyperplane file.
output /Calculating ideal SVM to separate the classes.\n/
learn (:*:ham_svm: |:*:spam_svm:|:*:ham_vs_spam:) < svm unigram unique > /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/

#   Classification phase - one index line per pass, :M: passes.
output /Starting classify on next :*:M: files.\n/
##start classify
{
    eval /:@: :*:M: > 0 :/
    {
        match <fromend> [:whole_index_file:] /[^\n]*\n/ (:lino:)
        output /start classify :*:lino:/
        match [:lino:] /([[:graph:]]+)[[:space:]]+([[:graph:]]+)/ (:: :label: :filo:)
        input (:messago:) [:*:filo: 0 10000]
        {
            #   Classify against the same files the learn statements wrote,
            #   so that overriding the file-name variables stays consistent.
            #   The classify sits in its own block so that a failing
            #   classify does not abort the scoring below.
            classify (:*:ham_svm: |:*:spam_svm:|:*:ham_vs_spam:) (:stat:) [:messago:] \
                < svm unigram unique > /[[:graph:]]+/ /0 0 100 1e-3 1 0.5 1/
        }
        output /:*:stat:\n/
        #   Score against the parsed label rather than the whole index
        #   line, because the file path may itself contain ham or spam.
        {
            match [:stat:] /succeeds/
            match [:label:] /ham/
            eval (:correct:) /:@: :*:correct: + 1:/
        }
        alius
        {
            match [:stat:] /fails/
            match [:label:] /spam/
            eval (:correct:) /:@: :*:correct: + 1:/
        }
    }
    eval (:M:) /:@: :*:M: - 1:/
    liaf
}

#   Report. :correct: is reused, first as the ratio and then as a percentage.
output /total=:*:total:\n/
output /correct=:*:correct:\n/
eval (:correct:) /:@: (:*:correct:) \/ (:*:total:) : /
eval (:correct:) /:@: :*:correct: * 100 :/
output /accuracy = :*:correct:%/
#output /total number of predicted texts = :*:total:,number of correctly classified texts = :*:correct:\n/