Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
package logs
import (
	"bytes"
	"regexp"
	"sort"
	"strings"
)
// sensitiveWordPattern is the (unanchored) regular expression describing a
// valid sensitive word: it must start and end with a lowercase letter or a
// digit, and may contain letters, digits and delimiters in between, e.g. the
// OTP "12345" or the card number "1234-12345678-12345678-1234".
//
// The delimiters accepted inside a word are space, dash, underscore, slash,
// dot and (per the character class) backslash.
//
// NOTE(review): words are meant to be treated as case insensitive, but the
// pattern itself only accepts lowercase letters; presumably callers
// lower-case the candidate before matching - confirm against the callers.
const sensitiveWordPattern = `[a-z0-9]([a-z0-9\.\\\/ _-]*[a-z0-9])*`

// sensitiveWordRegex matches a string that consists of exactly one sensitive
// word, i.e. sensitiveWordPattern anchored at both ends. It is compiled once
// at package initialisation.
var sensitiveWordRegex = regexp.MustCompile(`^` + sensitiveWordPattern + `$`)
// delimiters holds every byte that may sit directly before or after a
// sensitive word for that occurrence to be treated as a whole-word match.
const delimiters = "()[]{}!@#$%^&*-=_+;:'\"|\\/?<>,.~` \n\r"

// FilterSensitiveWordsMap replaces every whole-word occurrence of each key of
// wordsMap inside s with "***", regardless of the length of the word.
//
// An occurrence counts as a whole word only when it is bounded on both sides
// by the start/end of s or by one of the bytes in delimiters. This prevents a
// short word from being masked inside a longer token, which could expose the
// word by assumption: e.g. the OTP "202" must not turn the date "2021-01-02"
// into "***1-01-02".
//
// Only the keys of wordsMap are used; the boolean values are ignored. Empty
// keys are skipped, because an empty word would "match" between any two
// adjacent delimiters. Words are applied longest first (ties broken
// lexically) so that the result does not depend on map iteration order and a
// short word cannot split a longer one that contains it.
//
// It returns the filtered text and whether at least one occurrence was masked.
// Matching is byte-wise and case sensitive.
func FilterSensitiveWordsMap(s string, wordsMap map[string]bool) (filtered string, changed bool) {
	if len(wordsMap) == 0 || len(s) == 0 {
		return s, false
	}

	words := make([]string, 0, len(wordsMap))
	for word := range wordsMap {
		if word == "" {
			continue // nothing to hide; would otherwise loop forever inserting masks
		}
		words = append(words, word)
	}
	sort.Slice(words, func(i, j int) bool {
		if len(words[i]) != len(words[j]) {
			return len(words[i]) > len(words[j])
		}
		return words[i] < words[j]
	})

	// Compiling a regex per word would be wasteful; a plain substring search
	// followed by a check of the neighbouring bytes is much cheaper.
	buf := []byte(s)
	for _, word := range words {
		var masked bool
		buf, masked = maskSensitiveWord(buf, []byte(word))
		changed = changed || masked
	}
	if !changed {
		return s, false
	}
	return string(buf), true
}

// maskSensitiveWord replaces each delimited occurrence of word in buf with
// "***" and reports whether any replacement happened. word must not be empty.
// The returned slice may share storage with buf.
func maskSensitiveWord(buf, word []byte) ([]byte, bool) {
	const mask = "***"
	masked := false
	for offset := 0; offset < len(buf); {
		rel := bytes.Index(buf[offset:], word)
		if rel < 0 {
			break // no further occurrences
		}
		start := offset + rel
		end := start + len(word)

		if !isWordBoundary(buf, start, end) {
			// Part of a longer token: retry one byte further on so that
			// overlapping candidates are still considered.
			offset = start + 1
			continue
		}

		buf = replaceSpan(buf, start, end, mask)
		masked = true
		offset = start + len(mask) // resume right after the inserted mask
	}
	return buf, masked
}

// isWordBoundary reports whether buf[start:end] is bounded on both sides by
// either the edge of buf or a byte listed in delimiters.
func isWordBoundary(buf []byte, start, end int) bool {
	if start > 0 && strings.IndexByte(delimiters, buf[start-1]) < 0 {
		return false
	}
	if end < len(buf) && strings.IndexByte(delimiters, buf[end]) < 0 {
		return false
	}
	return true
}

// replaceSpan substitutes buf[start:end] with repl, shifting the tail of buf
// left or right as needed, and returns the resulting slice.
func replaceSpan(buf []byte, start, end int, repl string) []byte {
	oldLen := len(buf)
	delta := len(repl) - (end - start)
	switch {
	case delta > 0:
		// Grow first, then move the tail right; copy is safe on overlap.
		buf = append(buf, make([]byte, delta)...)
		copy(buf[start+len(repl):], buf[end:oldLen])
	case delta < 0:
		// Move the tail left, then drop the now unused bytes at the end.
		copy(buf[start+len(repl):], buf[end:])
		buf = buf[:oldLen+delta]
	}
	copy(buf[start:], repl)
	return buf
}