In essence, what I'm looking to do is split a string into one-word and two-word segments and train classifiers against them. The code first lowercases the string, then strips every character that isn't alphanumeric or a space.
It then splits the result into a list of one-word entries, and from that builds a new *[]string filled with every sequential two-word pair. For example, "Hello, World! Foo" yields ["hello", "world", "foo"] and ["hello world", "world foo"].
In particular, I'm concerned about the way I'm creating the two-word *[]string slice, and the way I'm using pointers. The first is a speed concern; the second is, well, a memory-efficiency concern.
package spamfilter

import (
    "regexp"
    "strings"

    "github.com/jbrukh/bayesian"
)

const (
    Spam     bayesian.Class = "Spam"
    Spamlike bayesian.Class = "Spamlike"
    NotSpam  bayesian.Class = "NotSpam"
)
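// SpamFilter pairs a classifier over single words with one over
// sequential word pairs.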
type SpamFilter struct {
    singleWordClassifier *bayesian.Classifier
    doubleWordClassifier *bayesian.Classifier
}
func CreateNewSpamFilter() *SpamFilter {
    v1 := bayesian.NewClassifier(Spam, Spamlike, NotSpam)
    v2 := bayesian.NewClassifier(Spam, Spamlike, NotSpam)
    tmp := SpamFilter{v1, v2}
    return &tmp
}
func LoadSpamFilterFromFile(fileName1 string, fileName2 string) *SpamFilter {
    tmp1, err1 := bayesian.NewClassifierFromFile(fileName1)
    tmp2, err2 := bayesian.NewClassifierFromFile(fileName2)
    switch {
    case err1 != nil:
        panic(err1)
    case err2 != nil:
        panic(err2)
    }
    return &SpamFilter{tmp1, tmp2}
}
func splitStringToSingleWords(input string) *[]string {
    reg, err := regexp.Compile("[^A-Za-z0-9 ]+")
    if err != nil {
        panic(err)
    }
    tmp := reg.ReplaceAllString(input, "")
    split := strings.Fields(tmp)
    return &split
}
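One variation I've been weighing, not shown above, is compiling the pattern once at package level so it isn't rebuilt on every call; nonAlnum and the function name here are placeholders of mine:

// Compile once at init time; MustCompile panics at startup if the
// fixed pattern is bad, so the per-call error branch disappears.
var nonAlnum = regexp.MustCompile("[^A-Za-z0-9 ]+")

func splitStringToSingleWordsCached(input string) []string {
    return strings.Fields(nonAlnum.ReplaceAllString(input, ""))
}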
func SplitStringToWords(input string) (*[]string, *[]string) {
    splitSingle := splitStringToSingleWords(strings.ToLower(input))
    // Guard against empty input: make([]string, -1) would panic otherwise.
    splitDouble := make([]string, 0)
    if len(*splitSingle) > 1 {
        splitDouble = make([]string, len(*splitSingle)-1)
        for i := 0; i < len(*splitSingle)-1; i++ {
            splitDouble[i] = (*splitSingle)[i] + " " + (*splitSingle)[i+1]
        }
    }
    return splitSingle, &splitDouble
}
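Along the same lines, I sketched a pointer-free variant, since a slice is already a small header (pointer, length, capacity) and returning it by value avoids the *[]string indirection. It reuses the package-level nonAlnum from the sketch above, and the name is again a placeholder:

// Hypothetical value-returning variant of SplitStringToWords.
func splitStringToWordsByValue(input string) ([]string, []string) {
    single := strings.Fields(nonAlnum.ReplaceAllString(strings.ToLower(input), ""))
    double := make([]string, 0)
    if len(single) > 1 {
        double = make([]string, len(single)-1)
        for i := range double {
            // Join each word with its successor into a two-word entry.
            double[i] = single[i] + " " + single[i+1]
        }
    }
    return single, double
}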
func (filter *SpamFilter) TrainStringsInSpamFilter(trainType bayesian.Class, singleWords *[]string, doubleWords *[]string) *SpamFilter {
    (*filter).singleWordClassifier.Learn(*singleWords, trainType)
    (*filter).doubleWordClassifier.Learn(*doubleWords, trainType)
    return filter
}
func (filter *SpamFilter) TestStringThroughSpamFilter(testString string) (*[]float64, *[]float64) {
    splitSingle, splitDouble := SplitStringToWords(testString)
    p1, _, _ := (*filter).singleWordClassifier.ProbScores(*splitSingle)
    p2, _, _ := (*filter).doubleWordClassifier.ProbScores(*splitDouble)
    return &p1, &p2
}
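For context, this is roughly how I intend to wire everything together; the sample strings are invented, real training would use a proper corpus, and fmt would need importing:

func exampleUsage() {
    filter := CreateNewSpamFilter()

    // Train both classifiers on a labelled example.
    single, double := SplitStringToWords("Buy cheap pills now")
    filter.TrainStringsInSpamFilter(Spam, single, double)

    single, double = SplitStringToWords("Lunch at noon tomorrow")
    filter.TrainStringsInSpamFilter(NotSpam, single, double)

    // Score an unseen string against both classifiers.
    p1, p2 := filter.TestStringThroughSpamFilter("cheap lunch pills")
    fmt.Println(*p1, *p2)
}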
How does this look, generally speaking? What should I have done differently?
Specifically:
- Is my use of pointers correct?
- Am I segmenting the strings in the most efficient possible way? I intend to use this on a rather large dataset.
- Will this be threadsafe/goroutine-friendly for large datasets? (See the concurrency sketch below.)
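For that last point, this is the sort of harness I'd run under the -race flag; numWorkers and the sample text are placeholders, sync would need importing, and since I don't believe bayesian.Classifier documents any thread-safety guarantees, concurrent Learn calls would presumably need a mutex:

// Concurrency smoke test: score from several goroutines at once.
func hammerFilter(filter *SpamFilter) {
    const numWorkers = 8 // placeholder
    var wg sync.WaitGroup
    for i := 0; i < numWorkers; i++ {
        wg.Add(1)
        go func() {
            defer wg.Done()
            // Read-only scoring; concurrent training would need locking.
            filter.TestStringThroughSpamFilter("some sample text")
        }()
    }
    wg.Wait()
}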