package main

import (
	"encoding/json"
	"flag"
	"fmt"
	"io/ioutil"
	"os"
	"path/filepath"
	"regexp"
	"strings"

	"github.com/cdipaolo/goml/base"
	"github.com/cdipaolo/goml/text"
	"github.com/rs/zerolog"
	"github.com/rs/zerolog/log"
)

// logEntry is a single emoji-reaction record parsed from a chat log line.
type logEntry struct {
	Who    string
	Author string
	Body   string
	Emojy  string
}

type logs []logEntry
type emojySet map[string]bool

// MetaData records what the trained model needs at prediction time: the
// number of classes and the emoji each class index maps back to.
type MetaData struct {
	NClasses  uint8
	ClassList []string
}

func main() {
	log.Logger = log.With().Caller().Stack().Logger()
	log.Logger = log.Logger.Output(zerolog.ConsoleWriter{Out: os.Stderr})

	logDir := flag.String("path", "", "path to logs")
	outFile := flag.String("out", "emojy.model", "path to store model")
	flag.Parse()

	if *logDir == "" {
		fmt.Fprintf(os.Stderr, "You must provide a log path.\n\n")
		flag.Usage()
		os.Exit(1)
	}

	logs, classes := prepLogs(*logDir)
	model, meta := bayes(logs, classes)

	err := model.PersistToFile(*outFile)
	if err != nil {
		log.Fatal().Err(err).Msg("failed to save model")
	}

	metaJSON, err := json.Marshal(meta)
	if err != nil {
		log.Fatal().Err(err).Msg("failed to marshal model metadata")
	}

	err = ioutil.WriteFile(*outFile+".json", metaJSON, 0666)
	if err != nil {
		log.Fatal().Err(err).Msg("failed to save model metadata")
	}
}

// re matches reaction lines of the form
// "[timestamp] <who> reacted to author: body with :emojy:".
var re = regexp.MustCompile(`(?i)^\[.+\] <(?P<Who>[[:punct:][:alnum:]]+)> reacted to (?P<Author>[[:punct:][:alnum:]]+): (?P<Body>.+) with :(?P<Emojy>[[:punct:][:alnum:]]+):$`)

// prepLogs walks the log directory and returns every emoji-reaction entry it
// finds, along with the set of emoji seen (the classification classes).
func prepLogs(path string) (logs, emojySet) {
	entries := logs{}
	emojies := emojySet{}
	err := filepath.Walk(path, func(path string, info os.FileInfo, err error) error {
		if err != nil {
			return err
		}
		if info.IsDir() {
			return nil
		}
		tmp, err := ioutil.ReadFile(path)
		if err != nil {
			return err
		}
		content := string(tmp)
		for _, line := range strings.Split(content, "\n") {
			if strings.Contains(line, "unknown event") {
				continue
			}
			if !re.MatchString(line) {
				continue
			}
			entry := parseEntry(line)
			emojies[entry.Emojy] = true
			log.Debug().
				Interface("entry", entry).
				Str("line", line).
				Msg("Found emojy reaction entry")
			entries = append(entries, entry)
		}
		return nil
	})
	if err != nil {
		log.Fatal().Msgf("Error walking: %s", err)
	}
	return entries, emojies
}

// parseEntry extracts the named capture groups from a single reaction line.
func parseEntry(content string) logEntry {
	out := logEntry{}
	subs := re.FindStringSubmatch(content)
	if len(subs) == 0 {
		return out
	}
	for i, n := range re.SubexpNames() {
		switch n {
		case "Who":
			out.Who = subs[i]
		case "Author":
			out.Author = subs[i]
		case "Body":
			out.Body = subs[i]
		case "Emojy":
			out.Emojy = subs[i]
		}
	}
	return out
}

// bayes trains a Naive Bayes classifier that maps message bodies to emoji
// reactions and returns the model along with its metadata.
func bayes(logs logs, classes emojySet) (*text.NaiveBayes, MetaData) {
	// create the channel of data and errors
	stream := make(chan base.TextDatapoint, 100)
	errors := make(chan error)

	// assign each emoji a class index in {0,...,n-1}
	nClasses := uint8(len(classes))
	classMap := map[string]uint8{}
	classList := []string{}
	for k := range classes {
		classList = append(classList, k)
		classMap[k] = uint8(len(classList) - 1)
	}

	log.Debug().
		Strs("classList", classList).
		Interface("classMap", classMap).
		Int("nLogs", len(logs)).
		Msg("about to train")

	// make a new NaiveBayes model expecting nClasses classes; given n as
	// the classes argument, datapoints must carry labels in {0,...,n-1}.
	//
	// Note that the model filters the text to omit anything except words,
	// numbers, and spaces.
	model := text.NewNaiveBayes(stream, nClasses, base.OnlyWordsAndNumbers)

	go model.OnlineLearn(errors)

	for _, l := range logs {
		stream <- base.TextDatapoint{
			X: l.Body,
			Y: classMap[l.Emojy],
		}
	}

	close(stream)

	for {
		err := <-errors
		if err != nil {
			log.Error().Err(err).Msg("Error passed")
		} else {
			// training is done!
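			// Assumption about goml's behavior: OnlineLearn is expected to
			// close the errors channel once the stream has been drained, so
			// a nil error received here is taken as the signal that training
			// has finished.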
			break
		}
	}

	// run a quick sanity-check prediction with the trained model
	in := "Should work properly once that number of documents increases."
	class := model.Predict(in)
	emojy := classList[class]
	log.Debug().Msgf("Class prediction for %s: %v", in, emojy)

	meta := MetaData{
		NClasses:  nClasses,
		ClassList: classList,
	}
	return model, meta
}
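// A minimal sketch of how the persisted model and metadata might be loaded
// back for prediction elsewhere. It assumes goml models expose RestoreFromFile
// as the counterpart to the PersistToFile call used above; the file names and
// input string here are illustrative only.
//
//	meta := MetaData{}
//	metaJSON, err := ioutil.ReadFile("emojy.model.json")
//	if err == nil {
//		err = json.Unmarshal(metaJSON, &meta)
//	}
//	if err != nil {
//		log.Fatal().Err(err).Msg("failed to load metadata")
//	}
//	stream := make(chan base.TextDatapoint, 1)
//	model := text.NewNaiveBayes(stream, meta.NClasses, base.OnlyWordsAndNumbers)
//	if err := model.RestoreFromFile("emojy.model"); err != nil {
//		log.Fatal().Err(err).Msg("failed to load model")
//	}
//	class := model.Predict("some message text")
//	fmt.Println(meta.ClassList[class])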