From 8b3041ded1902548a87c45012ed41c382f010b9d Mon Sep 17 00:00:00 2001 From: Zhang Minghan Date: Tue, 12 Dec 2023 11:45:21 +0800 Subject: [PATCH] fix unicode decoder --- adapter/chatgpt/processor.go | 8 +++++++- adapter/oneapi/processor.go | 6 +++--- utils/char.go | 15 ++++++++++----- 3 files changed, 20 insertions(+), 9 deletions(-) diff --git a/adapter/chatgpt/processor.go b/adapter/chatgpt/processor.go index 8b66e9b..798b3d0 100644 --- a/adapter/chatgpt/processor.go +++ b/adapter/chatgpt/processor.go @@ -167,7 +167,13 @@ func getRobustnessResult(chunk string) string { matches := compile.FindStringSubmatch(chunk) if len(matches) > 1 { - return matches[1] + partial := matches[1] + // if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's` + if utils.ContainUnicode(partial) { + partial = utils.DecodeUnicode(partial) + } + + return partial } else { return "" } diff --git a/adapter/oneapi/processor.go b/adapter/oneapi/processor.go index b281a45..ada7c1e 100644 --- a/adapter/oneapi/processor.go +++ b/adapter/oneapi/processor.go @@ -91,9 +91,9 @@ func getRobustnessResult(chunk string) string { matches := compile.FindStringSubmatch(chunk) if len(matches) > 1 { partial := matches[1] - // if is the unicode character - if strings.HasPrefix(partial, "\\u") { - return utils.DecodeUnicode(partial) + // if the unicode character is in the string, like `hi\\u2019s`, we need to convert it to `hi's` + if utils.ContainUnicode(partial) { + partial = utils.DecodeUnicode(partial) } return partial diff --git a/utils/char.go b/utils/char.go index bd65c55..45a02e8 100644 --- a/utils/char.go +++ b/utils/char.go @@ -152,16 +152,21 @@ func ExtractImageUrls(data string) []string { return re.FindAllString(data, -1) } +func ContainUnicode(data string) bool { + // like `hi\\u2019s` => true + re := regexp.MustCompile(`\\u([0-9a-fA-F]{4})`) + return re.MatchString(data) +} + func DecodeUnicode(data string) string { + // like `hi\\u2019s` => `hi's` re := regexp.MustCompile(`\\u([0-9a-fA-F]{4})`) return re.ReplaceAllStringFunc(data, func(s string) string { - if len(s) < 6 { - return s - } - val, err := strconv.ParseInt(s[2:], 16, 32) + unicode, err := strconv.ParseInt(s[2:], 16, 32) if err != nil { return s } - return strconv.FormatInt(val, 10) + + return string(rune(unicode)) }) }