fix unicode decoder

This commit is contained in:
Zhang Minghan 2023-12-12 11:45:21 +08:00
parent bfac02db1d
commit 8b3041ded1
3 changed files with 20 additions and 9 deletions

View File

@ -167,7 +167,13 @@ func getRobustnessResult(chunk string) string {
matches := compile.FindStringSubmatch(chunk)
if len(matches) > 1 {
return matches[1]
partial := matches[1]
// if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's`
if utils.ContainUnicode(partial) {
partial = utils.DecodeUnicode(partial)
}
return partial
} else {
return ""
}

View File

@ -91,9 +91,9 @@ func getRobustnessResult(chunk string) string {
matches := compile.FindStringSubmatch(chunk)
if len(matches) > 1 {
partial := matches[1]
// if is the unicode character
if strings.HasPrefix(partial, "\\u") {
return utils.DecodeUnicode(partial)
// if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's`
if utils.ContainUnicode(partial) {
partial = utils.DecodeUnicode(partial)
}
return partial

View File

@ -152,16 +152,21 @@ func ExtractImageUrls(data string) []string {
return re.FindAllString(data, -1)
}
// containUnicodePattern matches one literal `\uXXXX` escape: a backslash,
// the letter `u`, and exactly four hexadecimal digits. It is compiled once
// at package scope instead of on every call.
var containUnicodePattern = regexp.MustCompile(`\\u[0-9a-fA-F]{4}`)

// ContainUnicode reports whether data contains at least one literal `\uXXXX`
// escape sequence (for example, the text `hi\u2019s` yields true). Strings
// that already hold the decoded character, or a malformed escape such as
// `\u12`, yield false.
func ContainUnicode(data string) bool {
	return containUnicodePattern.MatchString(data)
}
// decodeUnicodeRuns matches one or more consecutive literal `\uXXXX` escapes.
// Matching a whole run (rather than a single escape) lets a UTF-16 surrogate
// pair such as `\ud83d\ude00` be decoded as one character.
var decodeUnicodeRuns = regexp.MustCompile(`(?:\\u[0-9a-fA-F]{4})+`)

// DecodeUnicode replaces every literal `\uXXXX` escape sequence in data with
// the character it encodes; all other text is returned unchanged. For
// example, the text `hi\u2019s` becomes "hi’s".
//
// Two adjacent escapes that form a UTF-16 surrogate pair are combined into a
// single code point. A surrogate half without its partner cannot be
// represented in UTF-8 and is emitted as U+FFFD, which is what Go's rune to
// string conversion produces for such values.
func DecodeUnicode(data string) string {
	return decodeUnicodeRuns.ReplaceAllStringFunc(data, func(run string) string {
		const escapeLen = 6 // len(`\uXXXX`)

		// Parse each escape in the run into its 16-bit code unit.
		units := make([]rune, 0, len(run)/escapeLen)
		for i := 0; i+escapeLen <= len(run); i += escapeLen {
			v, err := strconv.ParseUint(run[i+2:i+escapeLen], 16, 16)
			if err != nil {
				// The pattern guarantees four hex digits, so this should not
				// happen; leave the text untouched rather than corrupt it.
				return run
			}
			units = append(units, rune(v))
		}

		// Merge high/low surrogate pairs into supplementary code points.
		decoded := make([]rune, 0, len(units))
		for i := 0; i < len(units); i++ {
			r := units[i]
			if r >= 0xD800 && r <= 0xDBFF && i+1 < len(units) {
				if lo := units[i+1]; lo >= 0xDC00 && lo <= 0xDFFF {
					r = 0x10000 + ((r - 0xD800) << 10) + (lo - 0xDC00)
					i++
				}
			}
			decoded = append(decoded, r)
		}
		return string(decoded)
	})
}