Skip to content
Snippets Groups Projects
sensitive_words.go 2.03 KiB
Newer Older
Francé Wilke's avatar
Francé Wilke committed
package logs

import (
	"bytes"
	"regexp"
	"strings"
)

//word may only consist of alpha-numerics with delimeters inside,
//e.g. OTP "12345" or Card number "1234-12345678-12345678-1234"
//and the delimiters may be spaces, dashes, underscores, slashes and dots
//and it is considered case insensitive

const sensitiveWordPattern = `[a-z0-9]([a-z0-9\.\\\/ _-]*[a-z0-9])*`

var sensitiveWordRegex = regexp.MustCompile("^" + sensitiveWordPattern + "$")

const delimiters = "()[]{}!@#$%^&*-=_+;:'\"|\\/?<>,.~` \n\r"

func FilterSensitiveWordsMap(s string, wordsMap map[string]bool) (filtered string, changed bool) {
	if len(wordsMap) == 0 {
		return s, false
	}

	changed = false
	f := []byte(s)
	for word := range wordsMap {
		//it will be inefficient to compile regex for each word in each context
		//much quicker to just look for the word and see if it is delimited as required
		//not to mach short words as part of longer words which may expose the word be assumption
		//e.g. OTP "202" should not match part of a date 2021-01-02 making it ***1-01-02
		wLen := len(word)
		offset := 0
		fLen := len(f)
		for offset < fLen {
			index := bytes.Index(f[offset:], []byte(word)) + offset
			if index < offset {
				break //word not found
			}

			//found the word, check delimiters before/after
			if index > 0 && strings.IndexByte(delimiters, f[index-1]) < 0 {
				offset = index + 1 //word match without delimiter before
				continue
			}

			if index+wLen < fLen && strings.IndexByte(delimiters, f[index+wLen]) < 0 {
				offset = index + 1 //word match without delimiter after
				continue
			}

			//has delimiter after, this is a word match, replace any length match with 3 stars "***"
			//pad length if required
			pad := 0
			for fLen < index+3 {
				f = append(f, ' ')
				fLen++
				pad++
			}
			f = append(f[:index+3], f[index+wLen:fLen-pad]...)
			f[index] = '*'
			f[index+1] = '*'
			f[index+2] = '*'
			fLen = len(f)
			changed = true
			offset = index + 3 //for loop skipped index over word, now skip offset over delimiter
		}
	}
	filtered = string(f)
	return
}