From a54e1230c9a8d18883b9da00c9ac8f99b79b4b7b Mon Sep 17 00:00:00 2001 From: Andrey Nering Date: Fri, 6 Feb 2026 15:43:39 -0300 Subject: [PATCH] feat: add `jsonrepair` package Added `charm.land/jsonrepair` package, to repair malformed JSON output by LLM models. This was written by GPT 5.2 Codex based on the Python version: https://github.com/mangiucugna/json_repair Assisted-by: GPT 5.2 Codex via Crush --- jsonrepair/jsonrepair.go | 1648 +++++++++++++++++++++++++++++++++ jsonrepair/jsonrepair_test.go | 1615 ++++++++++++++++++++++++++++++++ 2 files changed, 3263 insertions(+) create mode 100644 jsonrepair/jsonrepair.go create mode 100644 jsonrepair/jsonrepair_test.go diff --git a/jsonrepair/jsonrepair.go b/jsonrepair/jsonrepair.go new file mode 100644 index 0000000000000000000000000000000000000000..26aa5b98ac8370aab01156e882f2324253fdfccd --- /dev/null +++ b/jsonrepair/jsonrepair.go @@ -0,0 +1,1648 @@ +// Package jsonrepair provides utilities to repair malformed JSON. +package jsonrepair + +import ( + "bytes" + "encoding/json" + "errors" + "reflect" + "slices" + "strconv" + "strings" + "unicode" + "unicode/utf16" +) + +// Option is a function that configures the JSON repairer. +type Option func(*options) + +type options struct { + ensureASCII *bool + skipJSONLoads bool + streamStable bool + strict bool +} + +// LogEntry represents a log entry with context and text. 
+type LogEntry struct { + Context string `json:"context"` + Text string `json:"text"` +} + +type numberValue struct { + raw string +} + +type objectEntry struct { + key string + value any +} + +type orderedObject struct { + entries []objectEntry + index map[string]int +} + +func newOrderedObject() *orderedObject { + return &orderedObject{index: map[string]int{}} +} + +func (o *orderedObject) set(key string, value any) { + if idx, ok := o.index[key]; ok { + o.entries[idx].value = value + return + } + o.index[key] = len(o.entries) + o.entries = append(o.entries, objectEntry{key: key, value: value}) +} + +func (o *orderedObject) get(key string) (any, bool) { + idx, ok := o.index[key] + if !ok { + return nil, false + } + return o.entries[idx].value, true +} + +func (o *orderedObject) lastKey() (string, bool) { + if len(o.entries) == 0 { + return "", false + } + return o.entries[len(o.entries)-1].key, true +} + +func (o *orderedObject) hasKey(key string) bool { + _, ok := o.index[key] + return ok +} + +func (o *orderedObject) merge(other *orderedObject) { + for _, entry := range other.entries { + o.set(entry.key, entry.value) + } +} + +type contextValue int + +const ( + contextObjectKey contextValue = iota + contextObjectValue + contextArray +) + +type jsonContext struct { + context []contextValue + current *contextValue + empty bool +} + +func newJSONContext() *jsonContext { + return &jsonContext{empty: true} +} + +func (c *jsonContext) set(value contextValue) { + c.context = append(c.context, value) + c.current = &c.context[len(c.context)-1] + c.empty = false +} + +func (c *jsonContext) reset() { + if len(c.context) > 0 { + c.context = c.context[:len(c.context)-1] + } + if len(c.context) == 0 { + c.current = nil + c.empty = true + return + } + c.current = &c.context[len(c.context)-1] +} + +func (c *jsonContext) contains(value contextValue) bool { + return slices.Contains(c.context, value) +} + +type parser struct { + jsonStr []rune + index int + context *jsonContext + 
logging bool + logger []LogEntry + streamStable bool + strict bool + log func(string) +} + +func newParser(input string, logging bool, streamStable bool, strict bool) *parser { + p := &parser{ + jsonStr: []rune(input), + context: newJSONContext(), + logging: logging, + streamStable: streamStable, + strict: strict, + } + if logging { + p.log = p.addLog + } else { + p.log = func(string) {} + } + return p +} + +func (p *parser) addLog(text string) { + window := 10 + start := max(p.index-window, 0) + end := min(p.index+window, len(p.jsonStr)) + context := string(p.jsonStr[start:end]) + p.logger = append(p.logger, LogEntry{Text: text, Context: context}) +} + +func (p *parser) parse() (any, []LogEntry, error) { + jsonValue, err := p.parseJSON() + if err != nil { + return nil, nil, err + } + if p.index < len(p.jsonStr) { + p.log("The parser returned early, checking if there's more json elements") + values := []any{jsonValue} + for p.index < len(p.jsonStr) { + p.context.reset() + j, parseErr := p.parseJSON() + if parseErr != nil { + return nil, nil, parseErr + } + if isTruthy(j) { + if len(values) > 0 && isSameObject(values[len(values)-1], j) { + values = values[:len(values)-1] + } else if len(values) > 0 && !isTruthy(values[len(values)-1]) { + values = values[:len(values)-1] + } + values = append(values, j) + } else { + if len(values) > 1 { + _, ok := p.getCharAt(0) + if !ok { + break + } + if len(values) > 1 { + values = values[:len(values)-1] + } + p.index = len(p.jsonStr) + break + } + p.index++ + } + } + if len(values) == 1 { + p.log("There were no more elements, returning the element without the array") + jsonValue = values[0] + } else if p.strict { + p.log("Multiple top-level JSON elements found in strict mode, raising an error") + return nil, nil, errors.New("multiple top-level JSON elements found in strict mode") + } else { + jsonValue = values + } + } + return jsonValue, p.logger, nil +} + +func (p *parser) parseJSON() (any, error) { + for { + char, ok := 
p.getCharAt(0) + if !ok { + return "", nil + } + if char == '{' { + p.index++ + return p.parseObject() + } + if char == '[' { + p.index++ + return p.parseArray() + } + if !p.context.empty && (isStringDelimiter(char) || unicode.IsLetter(char)) { + return p.parseString() + } + if !p.context.empty && (unicode.IsDigit(char) || char == '-' || char == '.') { + return p.parseNumber() + } + if p.context.empty && (unicode.IsDigit(char) || char == '-' || char == '.') { + if onlyWhitespaceBefore(p) { + return p.parseNumber() + } + } + if char == '#' || char == '/' { + return p.parseComment() + } + if !p.context.empty && (char == 't' || char == 'f' || char == 'n') { + value := p.parseBooleanOrNull() + if value != "" { + return value, nil + } + return p.parseString() + } + if p.context.empty && (char == 't' || char == 'f' || char == 'n') { + if onlyWhitespaceBefore(p) { + value := p.parseBooleanOrNull() + if value != "" { + return value, nil + } + } + } + if p.context.empty && char == ':' { + return "", nil + } + p.index++ + } +} + +func (p *parser) getCharAt(offset int) (rune, bool) { + idx := p.index + offset + if idx < 0 || idx >= len(p.jsonStr) { + return 0, false + } + return p.jsonStr[idx], true +} + +func (p *parser) skipWhitespaces() { + for { + char, ok := p.getCharAt(0) + if !ok || !unicode.IsSpace(char) { + return + } + p.index++ + } +} + +func (p *parser) scrollWhitespaces(idx int) int { + for { + char, ok := p.getCharAt(idx) + if !ok || !unicode.IsSpace(char) { + return idx + } + idx++ + } +} + +func (p *parser) skipToCharacter(character rune, idx int) int { + targets := map[rune]struct{}{character: {}} + return p.skipToCharacters(targets, idx) +} + +func (p *parser) skipToCharacters(targets map[rune]struct{}, idx int) int { + i := p.index + idx + backslashes := 0 + for i < len(p.jsonStr) { + ch := p.jsonStr[i] + if ch == '\\' { + backslashes++ + i++ + continue + } + if _, ok := targets[ch]; ok && backslashes%2 == 0 { + return i - p.index + } + backslashes = 0 + 
i++ + } + return len(p.jsonStr) - p.index +} + +func (p *parser) parseArray() (any, error) { + arr := []any{} + p.context.set(contextArray) + char, ok := p.getCharAt(0) + for ok && char != ']' && char != '}' { + p.skipWhitespaces() + var value any + var err error + if isStringDelimiter(char) { + i := p.skipToCharacter(char, 1) + i = p.scrollWhitespaces(i + 1) + if nextChar, ok := p.getCharAt(i); ok && nextChar == ':' { // quoted token followed by ':' is parsed as an object + value, err = p.parseObject() + } else { + value, err = p.parseString() + } + } else { + value, err = p.parseJSON() + } + // Propagate strict-mode errors from every branch; discarding them used to append a nil element. + if err != nil { + return nil, err + } + + if isStrictlyEmpty(value) { + if nextChar, ok := p.getCharAt(0); !ok || (nextChar != ']' && nextChar != ',') { + p.index++ + } else { + arr = append(arr, value) + } + } else if strVal, ok := value.(string); ok && strVal == "..." { + if prev, ok := p.getCharAt(-1); ok && prev == '.' { + p.log("While parsing an array, found a stray '...'; ignoring it") + } else { + arr = append(arr, value) + } + } else { + arr = append(arr, value) + } + + char, ok = p.getCharAt(0) + for ok && char != ']' && (unicode.IsSpace(char) || char == ',') { + p.index++ + char, ok = p.getCharAt(0) + } + } + + if char != ']' { + p.log("While parsing an array we missed the closing ], ignoring it") + } + + p.index++ + p.context.reset() + return arr, nil +} + +func (p *parser) parseComment() (any, error) { + char, ok := p.getCharAt(0) + if !ok { + return "", nil + } + termination := map[rune]struct{}{'\n': {}, '\r': {}} + if p.context.contains(contextArray) { + termination[']'] = struct{}{} + } + if p.context.contains(contextObjectValue) { + termination['}'] = struct{}{} + } + if p.context.contains(contextObjectKey) { + termination[':'] = struct{}{} + } + if char == '#' { + comment := []rune{} + for ok { + if _, hit := termination[char]; hit { + break + } + comment = append(comment, char) + p.index++ + char, ok = p.getCharAt(0) + } + p.log("Found line comment: " + string(comment) + ", ignoring") + } else if char == 
'/' { + nextChar, ok := p.getCharAt(1) + if ok && nextChar == '/' { + comment := []rune{'/', '/'} + p.index += 2 + char, ok = p.getCharAt(0) + for ok { + if _, hit := termination[char]; hit { + break + } + comment = append(comment, char) + p.index++ + char, ok = p.getCharAt(0) + } + p.log("Found line comment: " + string(comment) + ", ignoring") + } else if ok && nextChar == '*' { + comment := []rune{'/', '*'} + p.index += 2 + for { + char, ok = p.getCharAt(0) + if !ok { + p.log("Reached end-of-string while parsing block comment; unclosed block comment.") + break + } + comment = append(comment, char) + p.index++ + if len(comment) >= 4 && comment[len(comment)-2] == '*' && comment[len(comment)-1] == '/' { // 4 runes minimum: the '*' of the opener must not double as the closer, as in "/*/" + break + } + } + p.log("Found block comment: " + string(comment) + ", ignoring") + } else { + p.index++ + } + } + if p.context.empty { + return p.parseJSON() + } + return "", nil +} + +func (p *parser) parseNumber() (any, error) { + numberChars := "0123456789-.eE/,_" + numberStr, startIndex := "", p.index // startIndex is where the candidate number begins + char, ok := p.getCharAt(0) + isArray := p.context.current != nil && *p.context.current == contextArray + for ok && strings.ContainsRune(numberChars, char) && (!isArray || char != ',' || strings.Contains(numberStr, "/")) { + if char != '_' { + numberStr += string(char) + } + p.index++ + char, ok = p.getCharAt(0) + } + if nextChar, ok := p.getCharAt(0); ok && unicode.IsLetter(nextChar) { + p.index = startIndex // numberStr omits skipped '_' runes, so rewind to the saved start rather than by its length + return p.parseString() + } + if len(numberStr) > 0 { + last := numberStr[len(numberStr)-1] + if last == '-' || last == 'e' || last == 'E' || last == '/' || last == ',' { + numberStr = numberStr[:len(numberStr)-1] + p.index-- + } + } + if strings.Contains(numberStr, "/") || strings.Contains(numberStr, "-") || strings.Contains(numberStr, ",") { + if numberStr == "-" { + return "", nil + } + if strings.ContainsAny(numberStr, "eE") { + floatVal, err := strconv.ParseFloat(numberStr, 64) + if err == nil { + formatted := formatFloat(floatVal) + return numberValue{raw: 
formatted}, nil + } + return numberStr, nil + } + return numberStr, nil + } + if strings.ContainsAny(numberStr, ".eE") { + floatVal, err := strconv.ParseFloat(numberStr, 64) + if err == nil { + formatted := formatFloat(floatVal) + return numberValue{raw: formatted}, nil + } + return numberStr, nil + } + if numberStr == "" { + return "", nil + } + return numberValue{raw: numberStr}, nil +} + +func (p *parser) parseObject() (any, error) { + obj := newOrderedObject() + startIndex := p.index + for { + p.skipWhitespaces() + char, ok := p.getCharAt(0) + if !ok || char == '}' { + break + } + if current, ok := p.getCharAt(0); ok && current == ':' { + p.log("While parsing an object we found a : before a key, ignoring") + p.index++ + } + p.context.set(contextObjectKey) + rollbackIndex := p.index + key := "" + for { + current, ok := p.getCharAt(0) + if !ok { + break + } + rollbackIndex = p.index + if current == '[' && key == "" { + prevKey, ok := obj.lastKey() + if ok { + prevValue, _ := obj.get(prevKey) + if prevArray, ok := prevValue.([]any); ok && !p.strict { + p.index++ + newArrayValue, err := p.parseArray() + if err != nil { + return nil, err + } + if newArray, ok := newArrayValue.([]any); ok { + listLengths := []int{} + for _, item := range prevArray { + if nested, ok := item.([]any); ok { + listLengths = append(listLengths, len(nested)) + } + } + expectedLen := 0 + if len(listLengths) > 0 { + same := true + for _, length := range listLengths { + if length != listLengths[0] { + same = false + break + } + } + if same { + expectedLen = listLengths[0] + } + } + if expectedLen > 0 { + tail := []any{} + for len(prevArray) > 0 { + if _, ok := prevArray[len(prevArray)-1].([]any); ok { + break + } + tail = append(tail, prevArray[len(prevArray)-1]) + prevArray = prevArray[:len(prevArray)-1] + } + if len(tail) > 0 { + reverseAny(tail) + if len(tail)%expectedLen == 0 { + p.log("While parsing an object we found row values without an inner array, grouping them into rows") + for i := 
0; i < len(tail); i += expectedLen { + prevArray = append(prevArray, tail[i:i+expectedLen]) + } + } else { + prevArray = append(prevArray, tail...) + } + } + if len(newArray) > 0 { + allLists := true + for _, item := range newArray { + if _, ok := item.([]any); !ok { + allLists = false + break + } + } + if allLists { + p.log("While parsing an object we found additional rows, appending them without flattening") + prevArray = append(prevArray, newArray...) + } else { + prevArray = append(prevArray, newArray) + } + } + } else { + if len(newArray) == 1 { + if nested, ok := newArray[0].([]any); ok { + prevArray = append(prevArray, nested...) + } else { + prevArray = append(prevArray, newArray...) + } + } else { + prevArray = append(prevArray, newArray...) + } + } + obj.set(prevKey, prevArray) + } + p.skipWhitespaces() + if nextChar, ok := p.getCharAt(0); ok && nextChar == ',' { + p.index++ + } + p.skipWhitespaces() + continue + } + } + } + rawKeyValue, err := p.parseString() + if err != nil { + return nil, err + } + rawKey, _ := rawKeyValue.(string) + key = rawKey + if key == "" { + p.skipWhitespaces() + } + if key != "" || (key == "" && func() bool { ch, ok := p.getCharAt(0); return ok && (ch == ':' || ch == '}') }()) { + if key == "" && p.strict { + p.log("Empty key found in strict mode while parsing object, raising an error") + return nil, errors.New("empty key found in strict mode while parsing object") + } + break + } + } + if p.context.contains(contextArray) && obj.hasKey(key) { + if p.strict { + p.log("Duplicate key found in strict mode while parsing object, raising an error") + return nil, errors.New("duplicate key found in strict mode while parsing object") + } + p.log("While parsing an object we found a duplicate key, closing the object here and rolling back the index") + p.index = rollbackIndex - 1 + p.insertRune(p.index+1, '{') + break + } + p.skipWhitespaces() + if current, ok := p.getCharAt(0); !ok || current == '}' { + continue + } + p.skipWhitespaces() + 
if current, ok := p.getCharAt(0); ok && current != ':' { + if p.strict { + p.log("Missing ':' after key in strict mode while parsing object, raising an error") + return nil, errors.New("missing ':' after key in strict mode while parsing object") + } + p.log("While parsing an object we missed a : after a key") + } + p.index++ + p.context.reset() + p.context.set(contextObjectValue) + p.skipWhitespaces() + value := any("") + if current, ok := p.getCharAt(0); ok && (current == ',' || current == '}') { + p.log("While parsing an object value we found a stray " + string(current) + ", ignoring it") + } else { + var err error + value, err = p.parseJSON() + if err != nil { + return nil, err + } + } + if value == "" && p.strict { + if prev, ok := p.getCharAt(-1); !ok || !isStringDelimiter(prev) { + p.log("Parsed value is empty in strict mode while parsing object, raising an error") + return nil, errors.New("parsed value is empty in strict mode while parsing object") + } + } + p.context.reset() + obj.set(key, value) + if current, ok := p.getCharAt(0); ok && (current == ',' || current == '\'' || current == '"') { + p.index++ + } + if current, ok := p.getCharAt(0); ok && current == ']' && p.context.contains(contextArray) { + p.log("While parsing an object we found a closing array bracket, closing the object here and rolling back the index") + p.index-- + break + } + p.skipWhitespaces() + } + p.index++ + if len(obj.entries) == 0 && p.index-startIndex > 2 { + if p.strict { + p.log("Parsed object is empty but contains extra characters in strict mode, raising an error") + return nil, errors.New("parsed object is empty but contains extra characters in strict mode") + } + if p.context.empty && p.index-startIndex <= 3 { + return obj, nil + } + if p.context.empty { + prefix := string(p.jsonStr[:startIndex-1]) + if strings.TrimSpace(prefix) == "" { + return obj, nil + } + } + p.log("Parsed object is empty, we will try to parse this as an array instead") + p.index = startIndex + return 
p.parseArray() + } + if len(obj.entries) == 0 && p.index-startIndex <= 2 { + return obj, nil + } + if !p.context.empty { + if current, ok := p.getCharAt(0); ok && current == '}' { + if p.context.current == nil || (*p.context.current != contextObjectKey && *p.context.current != contextObjectValue) { + p.log("Found an extra closing brace that shouldn't be there, skipping it") + p.index++ + } + } + return obj, nil + } + p.skipWhitespaces() + if current, ok := p.getCharAt(0); !ok || current != ',' { + return obj, nil + } + p.index++ + p.skipWhitespaces() + if current, ok := p.getCharAt(0); !ok || !isStringDelimiter(current) { + return obj, nil + } + if !p.strict { + p.log("Found a comma and string delimiter after object closing brace, checking for additional key-value pairs") + additionalValue, err := p.parseObject() + if err != nil { + return nil, err + } + if additionalObj, ok := additionalValue.(*orderedObject); ok { + obj.merge(additionalObj) + } + } + return obj, nil +} + +func (p *parser) parseString() (any, error) { + missingQuotes := false + doubledQuotes := false + ldelim := '"' + rdelim := '"' + + char, ok := p.getCharAt(0) + if ok && (char == '#' || char == '/') { + return p.parseComment() + } + for ok && !isStringDelimiter(char) && !isAlphaNum(char) { + p.index++ + char, ok = p.getCharAt(0) + } + if !ok { + return "", nil + } + if char == '\'' { + ldelim = '\'' + rdelim = '\'' + } else if char == '“' { + ldelim = '“' + rdelim = '”' + } else if isAlphaNum(char) { + if (char == 't' || char == 'f' || char == 'n') && (p.context.current == nil || *p.context.current != contextObjectKey) { + value := p.parseBooleanOrNull() + if value != "" { + return value, nil + } + } + if (char == 'T' || char == 'F' || char == 'N') && (p.context.current == nil || *p.context.current != contextObjectKey) { + value := p.parseBooleanOrNull() + if value != "" { + return value, nil + } + } + p.log("While parsing a string, we found a literal instead of a quote") + missingQuotes = true 
+ } + + if !missingQuotes { + p.index++ + } + if next, ok := p.getCharAt(0); ok && next == '`' { + if value, ok := p.parseJSONLLMBlock(); ok { + return value, nil + } + if p.context.empty { + return "", nil + } + p.log("While parsing a string, we found code fences but they did not enclose valid JSON, continuing parsing the string") + } + + if next, ok := p.getCharAt(0); ok && next == ldelim { + if (p.context.current != nil && *p.context.current == contextObjectKey && func() bool { ch, ok := p.getCharAt(1); return ok && ch == ':' }()) || + (p.context.current != nil && *p.context.current == contextObjectValue && func() bool { ch, ok := p.getCharAt(1); return ok && (ch == ',' || ch == '}') }()) || + (p.context.current != nil && *p.context.current == contextArray && func() bool { ch, ok := p.getCharAt(1); return ok && (ch == ',' || ch == ']') }()) { + p.index++ + return "", nil + } + if p.context.current != nil && *p.context.current == contextObjectKey { + i := p.scrollWhitespaces(1) + if ch, ok := p.getCharAt(i); ok && ch == ':' { + p.index++ + return "", nil + } + } + if next2, ok := p.getCharAt(1); ok && next2 == ldelim { + p.log("While parsing a string, we found a doubled quote and then a quote again, ignoring it") + if p.strict { + return nil, errors.New("found doubled quotes followed by another quote") + } + return "", nil + } + i := p.skipToCharacter(rdelim, 1) + if nextChar, ok := p.getCharAt(i + 1); ok && nextChar == rdelim { + p.log("While parsing a string, we found a valid starting doubled quote") + doubledQuotes = true + p.index++ + } else { + i = p.scrollWhitespaces(1) + nextChar, ok := p.getCharAt(i) + if ok && (isStringDelimiter(nextChar) || nextChar == '{' || nextChar == '[') { + p.log("While parsing a string, we found a doubled quote but also another quote afterwards, ignoring it") + if p.strict { + return nil, errors.New("found doubled quotes followed by another quote while parsing a string") + } + p.index++ + return "", nil + } + if !ok || (nextChar 
!= ',' && nextChar != ']' && nextChar != '}') { + p.log("While parsing a string, we found a doubled quote but it was a mistake, removing one quote") + p.index++ + } + } + } + + stringAcc := []rune{} + char, ok = p.getCharAt(0) + unmatchedDelimiter := false + for ok && char != rdelim { + if missingQuotes { + if p.context.current != nil && *p.context.current == contextObjectKey { + if char == ':' || unicode.IsSpace(char) { + p.log("While parsing a string missing the left delimiter in object key context, we found a :, stopping here") + break + } + } + if p.context.current != nil && *p.context.current == contextArray { + if char == ']' || char == ',' { + p.log("While parsing a string missing the left delimiter in array context, we found a ] or ,, stopping here") + break + } + } + } + if !p.streamStable && p.context.current != nil && *p.context.current == contextObjectValue { + if (char == ',' || char == '}') && (len(stringAcc) == 0 || stringAcc[len(stringAcc)-1] != rdelim) { + rstringDelimiterMissing := true + next := rune(0) + p.skipWhitespaces() + if next, ok := p.getCharAt(1); ok && next == '\\' { + rstringDelimiterMissing = false + } + i := p.skipToCharacter(rdelim, 1) + if _, ok := p.getCharAt(i); ok { + i++ + i = p.scrollWhitespaces(i) + next, _ = p.getCharAt(i) + if next == ',' || next == '}' { + rstringDelimiterMissing = false + } else { + i = p.skipToCharacter(ldelim, i) + if _, ok := p.getCharAt(i); !ok { + rstringDelimiterMissing = false + } else { + i = p.scrollWhitespaces(i + 1) + next, _ = p.getCharAt(i) + if next != ':' { + rstringDelimiterMissing = false + } + } + } + } else { + i = p.skipToCharacter(':', 1) + if _, ok := p.getCharAt(i); ok { + break + } + i = p.scrollWhitespaces(1) + j := p.skipToCharacter('}', i) + if j-i > 1 { + rstringDelimiterMissing = false + } else if _, ok := p.getCharAt(j); ok { + for k := len(stringAcc) - 1; k >= 0; k-- { + if stringAcc[k] == '{' { + rstringDelimiterMissing = false + break + } + } + } + } + if 
rstringDelimiterMissing { + p.log("While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. Stopping here") + break + } + } + } + if !p.streamStable && p.context.contains(contextArray) && char == ']' { + i := p.skipToCharacter(rdelim, 0) + if _, ok := p.getCharAt(i); !ok { + break + } + } + if p.context.current != nil && *p.context.current == contextObjectValue && char == '}' { + i := p.scrollWhitespaces(1) + nextChar, ok := p.getCharAt(i) + if ok && nextChar == '`' { + if c1, ok := p.getCharAt(i + 1); ok && c1 == '`' { + if c2, ok := p.getCharAt(i + 2); ok && c2 == '`' { + p.log("While parsing a string in object value context, we found a } that closes the object before code fences, stopping here") + break + } + } + } + if !ok { + p.log("While parsing a string in object value context, we found a } that closes the object, stopping here") + break + } + } + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + if p.streamStable && len(stringAcc) > 0 && stringAcc[len(stringAcc)-1] == '\\' { + stringAcc = stringAcc[:len(stringAcc)-1] + } + break + } + if len(stringAcc) > 0 && stringAcc[len(stringAcc)-1] == '\\' { + p.log("Found a stray escape sequence, normalizing it") + if char == rdelim || char == 't' || char == 'n' || char == 'r' || char == 'b' || char == '\\' { + stringAcc = stringAcc[:len(stringAcc)-1] + escapeSeqs := map[rune]rune{'t': '\t', 'n': '\n', 'r': '\r', 'b': '\b'} + if replacement, ok := escapeSeqs[char]; ok { + stringAcc = append(stringAcc, replacement) + } else { + stringAcc = append(stringAcc, char) + } + p.index++ + char, ok = p.getCharAt(0) + for ok && len(stringAcc) > 0 && stringAcc[len(stringAcc)-1] == '\\' && (char == rdelim || char == '\\') { + stringAcc = append(stringAcc[:len(stringAcc)-1], char) + p.index++ + char, ok = p.getCharAt(0) + } + continue + } + if char == 'u' || char == 'x' { + numChars := 4 + if char 
== 'x' { + numChars = 2 + } + nextChars := p.sliceRunes(p.index+1, p.index+1+numChars) + if len(nextChars) == numChars && isHexString(string(nextChars)) { + p.log("Found a unicode escape sequence, normalizing it") + parsed, _ := strconv.ParseInt(string(nextChars), 16, 32) + stringAcc = append(stringAcc[:len(stringAcc)-1], rune(parsed)) + p.index += 1 + numChars + char, ok = p.getCharAt(0) + continue + } + } else if isStringDelimiter(char) && char != rdelim { + p.log("Found a delimiter that was escaped but shouldn't be escaped, removing the escape") + stringAcc = append(stringAcc[:len(stringAcc)-1], char) + p.index++ + char, ok = p.getCharAt(0) + continue + } + } + if char == ':' && !missingQuotes && p.context.current != nil && *p.context.current == contextObjectKey { + i := p.skipToCharacter(ldelim, 1) + if _, ok := p.getCharAt(i); ok { + i++ + i = p.skipToCharacter(rdelim, i) + if _, ok := p.getCharAt(i); ok { + i++ + i = p.scrollWhitespaces(i) + ch, ok := p.getCharAt(i) + if ok && (ch == ',' || ch == '}') { + p.log("While parsing a string missing the right delimiter in object key context, we found a " + string(ch) + " stopping here") + break + } + } + } else { + p.log("While parsing a string missing the right delimiter in object key context, we found a :, stopping here") + break + } + } + if char == rdelim && (len(stringAcc) == 0 || stringAcc[len(stringAcc)-1] != '\\') { + if doubledQuotes { + if next, ok := p.getCharAt(1); ok && next == rdelim { + p.log("While parsing a string, we found a doubled quote, ignoring it") + p.index++ + } + } else if missingQuotes && p.context.current != nil && *p.context.current == contextObjectValue { + i := 1 + nextChar, ok := p.getCharAt(i) + for ok && nextChar != rdelim && nextChar != ldelim { + i++ + nextChar, ok = p.getCharAt(i) + } + if ok { + i++ + i = p.scrollWhitespaces(i) + if ch, ok := p.getCharAt(i); ok && ch == ':' { + p.index-- + char, _ = p.getCharAt(0) + p.log("In a string with missing quotes and object value 
context, I found a delimeter but it turns out it was the beginning on the next key. Stopping here.") + break + } + } + } else if unmatchedDelimiter { + unmatchedDelimiter = false + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + } else { + i := 1 + nextChar, ok := p.getCharAt(i) + checkCommaInObjectValue := true + for ok && nextChar != rdelim && nextChar != ldelim { + if checkCommaInObjectValue && unicode.IsLetter(nextChar) { + checkCommaInObjectValue = false + } + if (p.context.contains(contextObjectKey) && (nextChar == ':' || nextChar == '}')) || + (p.context.contains(contextObjectValue) && nextChar == '}') || + (p.context.contains(contextArray) && (nextChar == ']' || nextChar == ',')) || + (checkCommaInObjectValue && p.context.current != nil && *p.context.current == contextObjectValue && nextChar == ',') { + break + } + i++ + nextChar, ok = p.getCharAt(i) + } + if nextChar == ',' && p.context.current != nil && *p.context.current == contextObjectValue { + i++ + i = p.skipToCharacter(rdelim, i) + i++ + i = p.scrollWhitespaces(i) + nextChar, _ = p.getCharAt(i) + if nextChar == '}' || nextChar == ',' { + p.log("While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it") + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + break + } + continue + } + } else if nextChar == rdelim && func() bool { prev, ok := p.getCharAt(i - 1); return ok && prev != '\\' }() { + if onlyWhitespaceUntil(p, i) { + break + } + if p.context.current != nil && *p.context.current == contextObjectValue { + i = p.scrollWhitespaces(i + 1) + if ch, ok := p.getCharAt(i); ok && ch == ',' { + i = p.skipToCharacter(ldelim, i+1) + i++ + i = p.skipToCharacter(rdelim, i+1) + i++ + i = p.scrollWhitespaces(i) + if ch, ok := p.getCharAt(i); ok && ch == ':' { + p.log("While parsing a string, we found a misplaced quote that would have closed the string but has a different 
meaning here, ignoring it") + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + break + } + continue + } + } + i = p.skipToCharacter(rdelim, i+1) + i++ + nextChar, ok = p.getCharAt(i) + for ok && nextChar != ':' { + if nextChar == ',' || nextChar == ']' || nextChar == '}' || (nextChar == rdelim && func() bool { prev, ok := p.getCharAt(i - 1); return ok && prev != '\\' }()) { + break + } + i++ + nextChar, ok = p.getCharAt(i) + } + if nextChar != ':' { + p.log("While parsing a string, we found a misplaced quote that would have closed the string but has a different meaning here, ignoring it") + unmatchedDelimiter = !unmatchedDelimiter + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + break + } + } + } else if p.context.current != nil && *p.context.current == contextArray { + evenDelimiters := nextChar == rdelim + for nextChar == rdelim { + i = p.skipToCharacters(map[rune]struct{}{rdelim: {}, ']': {}}, i+1) + nextChar, ok = p.getCharAt(i) + if !ok || nextChar != rdelim { + evenDelimiters = false + break + } + i = p.skipToCharacters(map[rune]struct{}{rdelim: {}, ']': {}}, i+1) + nextChar, _ = p.getCharAt(i) + } + if evenDelimiters { + p.log("While parsing a string in Array context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it") + unmatchedDelimiter = !unmatchedDelimiter + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + break + } + } else { + break + } + } else if p.context.current != nil && *p.context.current == contextObjectKey { + p.log("While parsing a string in Object Key context, we detected a quoted section that would have closed the string but has a different meaning here, ignoring it") + stringAcc = append(stringAcc, char) + p.index++ + char, ok = p.getCharAt(0) + if !ok { + break + } + } + } + } + } + } + if ok && missingQuotes && p.context.current != nil && *p.context.current == 
contextObjectKey && unicode.IsSpace(char) { + p.log("While parsing a string, handling an extreme corner case in which the LLM added a comment instead of valid string, invalidate the string and return an empty value") + p.skipWhitespaces() + if ch, ok := p.getCharAt(0); ok { + if ch != ':' && ch != ',' { + p.index-- + return "", nil + } + if ch == ',' { + p.index-- + return "", nil + } + } + } + if missingQuotes && p.context.current != nil && *p.context.current == contextObjectKey { + if !onlyWhitespaceUntil(p, p.scrollWhitespaces(0)) { + stringAcc = trimRightWhitespace(stringAcc) + if len(stringAcc) == 0 { + return "", nil + } + } + } + if !ok || char != rdelim { + if !p.streamStable { + p.log("While parsing a string, we missed the closing quote, ignoring") + stringAcc = trimRightWhitespace(stringAcc) + } + } else { + p.index++ + } + if !p.streamStable && (missingQuotes || (len(stringAcc) > 0 && stringAcc[len(stringAcc)-1] == '\n')) { + stringAcc = trimRightWhitespace(stringAcc) + } + if missingQuotes && p.context.empty { + next := p.scrollWhitespaces(0) + if ch, ok := p.getCharAt(next); ok && (ch == '{' || ch == '[' || ch == '`') { + return "", nil + } + if !p.streamStable { + stringAcc = trimRightWhitespace(stringAcc) + } + if len(stringAcc) == 0 { + return "", nil + } + } + if p.context.empty { + next := p.scrollWhitespaces(0) + if ch, ok := p.getCharAt(next); ok && (ch == '{' || ch == '[' || ch == '`') { + return "", nil + } + } + if len(stringAcc) == 1 && stringAcc[0] == rdelim { + return "", nil + } + if p.context.empty && missingQuotes { + if len(stringAcc) == 1 && stringAcc[0] == '"' { + return "", nil + } + } + return string(stringAcc), nil +} + +func (p *parser) parseBooleanOrNull() any { + char, ok := p.getCharAt(0) + if !ok { + return "" + } + valueMap := map[rune]struct { + token string + value any + }{ + 't': {"true", true}, + 'f': {"false", false}, + 'n': {"null", nil}, + } + lower := unicode.ToLower(char) + value, ok := valueMap[lower] + if !ok { + 
return "" + } + matchUpper := unicode.IsUpper(char) + i := 0 + startingIndex := p.index + current := lower + for ok && i < len(value.token) && current == rune(value.token[i]) { + i++ + p.index++ + char, ok = p.getCharAt(0) + if ok { + if unicode.IsUpper(char) { + matchUpper = true + } + current = unicode.ToLower(char) + } + } + if i == len(value.token) { + if matchUpper && p.context.empty { + p.index = startingIndex + return "" + } + return value.value + } + p.index = startingIndex + return "" +} + +func (p *parser) parseJSONLLMBlock() (any, bool) { + if p.sliceString(p.index, p.index+7) == "```json" { + i := p.skipToCharacter('`', 7) + if p.sliceString(p.index+i, p.index+i+3) == "```" { + p.index += 7 + value, err := p.parseJSON() + if err != nil { + return nil, false + } + return value, true + } + } + return nil, false +} + +func (p *parser) sliceRunes(start int, end int) []rune { + if start < 0 { + start = 0 + } + if end > len(p.jsonStr) { + end = len(p.jsonStr) + } + if start > end { + return []rune{} + } + return p.jsonStr[start:end] +} + +func (p *parser) sliceString(start int, end int) string { + return string(p.sliceRunes(start, end)) +} + +func (p *parser) insertRune(pos int, r rune) { + if pos < 0 { + pos = 0 + } + if pos > len(p.jsonStr) { + pos = len(p.jsonStr) + } + p.jsonStr = append(p.jsonStr[:pos], append([]rune{r}, p.jsonStr[pos:]...)...) 
+} + +func onlyWhitespaceUntil(p *parser, end int) bool { + for j := 1; j < end; j++ { + c, ok := p.getCharAt(j) + if ok && !unicode.IsSpace(c) { + return false + } + } + return true +} + +func onlyWhitespaceBefore(p *parser) bool { + for i := p.index - 1; i >= 0; i-- { + c := p.jsonStr[i] + if !unicode.IsSpace(c) { + return false + } + } + return true +} + +func reverseAny(values []any) { + for i, j := 0, len(values)-1; i < j; i, j = i+1, j-1 { + values[i], values[j] = values[j], values[i] + } +} + +func isStrictlyEmpty(value any) bool { + switch v := value.(type) { + case string: + return len(v) == 0 + case []any: + return len(v) == 0 + case *orderedObject: + return len(v.entries) == 0 + default: + return false + } +} + +func isSameObject(obj1 any, obj2 any) bool { + switch v1 := obj1.(type) { + case *orderedObject: + v2, ok := obj2.(*orderedObject) + if !ok { + return false + } + if len(v1.entries) != len(v2.entries) { + return false + } + for _, entry := range v1.entries { + val2, ok := v2.get(entry.key) + if !ok { + return false + } + if !isSameObject(entry.value, val2) { + return false + } + } + return true + case []any: + v2, ok := obj2.([]any) + if !ok { + return false + } + if len(v1) != len(v2) { + return false + } + for i := range v1 { + if !isSameObject(v1[i], v2[i]) { + return false + } + } + return true + default: + if obj1 == nil || obj2 == nil { + return obj1 == obj2 + } + return reflect.TypeOf(obj1) == reflect.TypeOf(obj2) + } +} + +func isTruthy(value any) bool { + switch v := value.(type) { + case string: + return v != "" + case []any: + return len(v) > 0 + case *orderedObject: + return len(v.entries) > 0 + case bool: + return v + case numberValue: + return v.raw != "" + case nil: + return false + default: + return true + } +} + +func isStringDelimiter(char rune) bool { + switch char { + case '"', '\'', '“', '”': + return true + default: + return false + } +} + +func isAlphaNum(char rune) bool { + return unicode.IsLetter(char) || 
unicode.IsDigit(char) +} + +func trimRightWhitespace(values []rune) []rune { + for len(values) > 0 { + if !unicode.IsSpace(values[len(values)-1]) { + break + } + values = values[:len(values)-1] + } + return values +} + +func isHexString(value string) bool { + if value == "" { + return false + } + for _, c := range value { + if (c < '0' || c > '9') && (c < 'a' || c > 'f') && (c < 'A' || c > 'F') { + return false + } + } + return true +} + +func formatFloat(value float64) string { + formatted := strconv.FormatFloat(value, 'f', -1, 64) + if !strings.Contains(formatted, ".") { + formatted += ".0" + } + return formatted +} + +func applyOptions(opts []Option) options { + cfg := options{} + for _, opt := range opts { + if opt != nil { + opt(&cfg) + } + } + return cfg +} + +func ensureASCIIValue(cfg options) bool { + if cfg.ensureASCII == nil { + return true + } + return *cfg.ensureASCII +} + +// WithEnsureASCII sets whether to escape non-ASCII characters. +func WithEnsureASCII(value bool) Option { + return func(o *options) { + o.ensureASCII = &value + } +} + +// WithSkipJSONLoads skips JSON parsing during load. +func WithSkipJSONLoads() Option { + return func(o *options) { + o.skipJSONLoads = true + } +} + +// WithStreamStable enables streaming-stable parsing. +func WithStreamStable() Option { + return func(o *options) { + o.streamStable = true + } +} + +// WithStrict enables strict parsing mode. +func WithStrict() Option { + return func(o *options) { + o.strict = true + } +} + +// RepairJSON takes a potentially malformed JSON string output from LLMs and +// attempts to repair it into a valid JSON string. It returns the repaired JSON +// string or an error if the input cannot be repaired. 
+func RepairJSON(input string, opts ...Option) (string, error) { + cfg := applyOptions(opts) + p := newParser(input, false, cfg.streamStable, cfg.strict) + value, _, err := p.parse() + if err != nil { + return "", err + } + if str, ok := value.(string); ok { + trimmed := strings.TrimSpace(str) + if str == "" || trimmed == "" { + return "", nil + } + return "", nil + } + if value == "" { + return "", nil + } + return serialize(value, ensureASCIIValue(cfg)), nil +} + +// Loads takes a potentially malformed JSON string output from LLMs and attempts +// to repair it and parse it into a Go value. +func Loads(input string, opts ...Option) (any, error) { + cfg := applyOptions(opts) + p := newParser(input, false, cfg.streamStable, cfg.strict) + value, _, err := p.parse() + if err != nil { + return nil, err + } + if value == "" { + return "", nil + } + return normalizeValue(value), nil +} + +// RepairJSONWithLog takes a potentially malformed JSON string output from LLMs +// and attempts to repair it into a valid JSON string, while also returning logs +// of the repair process. 
+func RepairJSONWithLog(input string, opts ...Option) (any, []LogEntry, error) { + cfg := applyOptions(opts) + p := newParser(input, true, cfg.streamStable, cfg.strict) + value, logs, err := p.parse() + if err != nil { + return nil, nil, err + } + if logs == nil { + logs = []LogEntry{} + } + if value == "" { + return "", logs, nil + } + return normalizeValue(value), logs, nil +} + +func normalizeValue(value any) any { + switch v := value.(type) { + case *orderedObject: + result := map[string]any{} + for _, entry := range v.entries { + result[entry.key] = normalizeValue(entry.value) + } + return result + case []any: + items := make([]any, 0, len(v)) + for _, item := range v { + items = append(items, normalizeValue(item)) + } + return items + case numberValue: + return json.Number(v.raw) + default: + return v + } +} + +func serialize(value any, ensureASCII bool) string { + var buf bytes.Buffer + writeValue(&buf, value, ensureASCII) + return buf.String() +} + +func writeValue(buf *bytes.Buffer, value any, ensureASCII bool) { + switch v := value.(type) { + case string: + buf.WriteByte('"') + writeEscapedString(buf, v, ensureASCII) + buf.WriteByte('"') + case numberValue: + buf.WriteString(v.raw) + case json.Number: + buf.WriteString(v.String()) + case bool: + if v { + buf.WriteString("true") + } else { + buf.WriteString("false") + } + case nil: + buf.WriteString("null") + case []any: + buf.WriteByte('[') + for i, item := range v { + if i > 0 { + buf.WriteString(", ") + } + writeValue(buf, item, ensureASCII) + } + buf.WriteByte(']') + case *orderedObject: + buf.WriteByte('{') + for i, entry := range v.entries { + if i > 0 { + buf.WriteString(", ") + } + buf.WriteByte('"') + writeEscapedString(buf, entry.key, ensureASCII) + buf.WriteByte('"') + buf.WriteString(": ") + writeValue(buf, entry.value, ensureASCII) + } + buf.WriteByte('}') + case float64: + buf.WriteString(formatFloat(v)) + case int: + buf.WriteString(strconv.Itoa(v)) + case int64: + 
// writeEscapedString appends value to buf with JSON string escaping applied;
// the surrounding quotes are not written.
//
// Backslash, double quote and the control characters \b, \f, \n, \r and \t use
// their short escapes, and any other rune below 0x20 is written as \u00XX.
// When ensureASCII is true, every rune above 0x7f is written as a \uXXXX
// escape, using a UTF-16 surrogate pair for runes above 0xFFFF. All remaining
// runes are copied as they are.
func writeEscapedString(buf *bytes.Buffer, value string, ensureASCII bool) {
	for _, r := range value {
		switch r {
		case '\\':
			buf.WriteString("\\\\")
		case '"':
			buf.WriteString("\\\"")
		case '\b':
			buf.WriteString("\\b")
		case '\f':
			buf.WriteString("\\f")
		case '\n':
			buf.WriteString("\\n")
		case '\r':
			buf.WriteString("\\r")
		case '\t':
			buf.WriteString("\\t")
		default:
			switch {
			case r < 0x20:
				buf.WriteString("\\u")
				buf.WriteString(hex4(r))
			case ensureASCII && r > 0xFFFF:
				// Outside the Basic Multilingual Plane: emit both halves of
				// the surrogate pair.
				high, low := utf16.EncodeRune(r)
				buf.WriteString("\\u")
				buf.WriteString(hex4(high))
				buf.WriteString("\\u")
				buf.WriteString(hex4(low))
			case ensureASCII && r > 0x7f:
				buf.WriteString("\\u")
				buf.WriteString(hex4(r))
			default:
				buf.WriteRune(r)
			}
		}
	}
}

// hex4 formats r as lowercase hexadecimal, left-padded with zeros to at least
// four digits. Values that need more than four digits are returned unpadded;
// the previous version passed a negative count to strings.Repeat for them and
// panicked.
func hex4(r rune) string {
	digits := strconv.FormatInt(int64(r), 16)
	if pad := 4 - len(digits); pad > 0 {
		return strings.Repeat("0", pad) + digits
	}
	return digits
}
York\"}", + }, + { + name: "array_spacing", + input: "{\"employees\":[\"John\", \"Anna\", \"Peter\"]} ", + want: "{\"employees\": [\"John\", \"Anna\", \"Peter\"]}", + }, + { + name: "colon_in_string", + input: "{\"key\": \"value:value\"}", + want: "{\"key\": \"value:value\"}", + }, + { + name: "trailing_comma_in_string", + input: "{\"text\": \"The quick brown fox,\"}", + want: "{\"text\": \"The quick brown fox,\"}", + }, + { + name: "apostrophe_in_string", + input: "{\"text\": \"The quick brown fox won't jump\"}", + want: "{\"text\": \"The quick brown fox won't jump\"}", + }, + { + name: "missing_brace", + input: "{\"key\": \"\"", + want: "{\"key\": \"\"}", + }, + { + name: "nested_object", + input: "{\"key1\": {\"key2\": [1, 2, 3]}}", + want: "{\"key1\": {\"key2\": [1, 2, 3]}}", + }, + { + name: "large_integer", + input: "{\"key\": 12345678901234567890}", + want: "{\"key\": 12345678901234567890}", + }, + { + name: "unicode_escape", + input: "{\"key\": \"value☺\"}", + want: "{\"key\": \"value\\u263a\"}", + }, + { + name: "escaped_newline", + input: "{\"key\": \"value\\nvalue\"}", + want: "{\"key\": \"value\\nvalue\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input, tc.opts...) 
+ if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestRepairJSONMultipleTopLevel(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "array_then_object", + input: "[]{}", + want: "[]", + }, + { + name: "array_then_object_with_value", + input: "[]{\"key\":\"value\"}", + want: "{\"key\": \"value\"}", + }, + { + name: "object_then_array", + input: "{\"key\":\"value\"}[1,2,3,True]", + want: "[{\"key\": \"value\"}, [1, 2, 3, true]]", + }, + { + name: "embedded_code_blocks", + input: "lorem ```json {\"key\":\"value\"} ``` ipsum ```json [1,2,3,True] ``` 42", + want: "[{\"key\": \"value\"}, [1, 2, 3, true]]", + }, + { + name: "array_followed_by_array", + input: "[{\"key\":\"value\"}][{\"key\":\"value_after\"}]", + want: "[{\"key\": \"value_after\"}]", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestRepairJSONEnsureASCII(t *testing.T) { + got, err := RepairJSON("{'test_中国人_ascii':'统一码'}", WithEnsureASCII(false)) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + want := "{\"test_中国人_ascii\": \"统一码\"}" + if got != want { + t.Fatalf("got %q want %q", got, want) + } +} + +func TestRepairJSONStreamStable(t *testing.T) { + cases := []struct { + name string + input string + opts []Option + want string + }{ + { + name: "default_trailing_backslash", + input: "{\"key\": \"val\\", + want: "{\"key\": \"val\\\\\"}", + }, + { + name: "default_trailing_newline", + input: "{\"key\": \"val\\n", + want: "{\"key\": \"val\"}", + }, + { + name: "default_split_object", + input: "{\"key\": \"val\\n123,`key2:value2", + want: "{\"key\": \"val\\n123\", \"key2\": \"value2\"}", + }, + { + name: "stable_trailing_backslash", + 
input: "{\"key\": \"val\\", + opts: []Option{WithStreamStable()}, + want: "{\"key\": \"val\"}", + }, + { + name: "stable_trailing_newline", + input: "{\"key\": \"val\\n", + opts: []Option{WithStreamStable()}, + want: "{\"key\": \"val\\n\"}", + }, + { + name: "stable_split_object", + input: "{\"key\": \"val\\n123,`key2:value2", + opts: []Option{WithStreamStable()}, + want: "{\"key\": \"val\\n123,`key2:value2\"}", + }, + { + name: "stable_complete_stream", + input: "{\"key\": \"val\\n123,`key2:value2`\"}", + opts: []Option{WithStreamStable()}, + want: "{\"key\": \"val\\n123,`key2:value2`\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input, tc.opts...) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestLoads(t *testing.T) { + cases := []struct { + name string + input string + want any + }{ + { + name: "empty_array", + input: "[]", + want: []any{}, + }, + { + name: "empty_object", + input: "{}", + want: map[string]any{}, + }, + { + name: "bools_nulls", + input: "{\"key\": true, \"key2\": false, \"key3\": null}", + want: map[string]any{ + "key": true, + "key2": false, + "key3": nil, + }, + }, + { + name: "simple_object", + input: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}", + want: map[string]any{ + "name": "John", + "age": json.Number("30"), + "city": "New York", + }, + }, + { + name: "array_numbers", + input: "[1, 2, 3, 4]", + want: []any{ + json.Number("1"), + json.Number("2"), + json.Number("3"), + json.Number("4"), + }, + }, + { + name: "string_array", + input: "{\"employees\":[\"John\", \"Anna\", \"Peter\"]} ", + want: map[string]any{ + "employees": []any{"John", "Anna", "Peter"}, + }, + }, + { + name: "string_quotes_repaired", + input: "[{\"foo\": \"foo bar \"foobar\" foo bar baz.\", \"tag\": \"#foo-bar-foobar\"}]", + want: []any{ + map[string]any{ + "foo": "foo bar \"foobar\" foo 
bar baz.", + "tag": "#foo-bar-foobar", + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := Loads(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %#v want %#v", got, tc.want) + } + }) + } +} + +func TestRepairJSONSkipJSONLoads(t *testing.T) { + cases := []struct { + name string + input string + opts []Option + want string + }{ + { + name: "valid_json", + input: "{\"key\": true, \"key2\": false, \"key3\": null}", + opts: []Option{WithSkipJSONLoads()}, + want: "{\"key\": true, \"key2\": false, \"key3\": null}", + }, + { + name: "missing_value", + input: "{\"key\": true, \"key2\": false, \"key3\": }", + opts: []Option{WithSkipJSONLoads()}, + want: "{\"key\": true, \"key2\": false, \"key3\": \"\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input, tc.opts...) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } + + got, err := Loads("{\"key\": true, \"key2\": false, \"key3\": }", WithSkipJSONLoads()) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + want := map[string]any{ + "key": true, + "key2": false, + "key3": "", + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("got %#v want %#v", got, want) + } +} + +func TestRepairJSONWithLog(t *testing.T) { + cases := []struct { + name string + input string + wantValue any + wantLog []LogEntry + }{ + { + name: "valid_json", + input: "{}", + wantValue: map[string]any{}, + wantLog: []LogEntry{}, + }, + { + name: "missing_quote", + input: "{\"key\": \"value}", + wantValue: map[string]any{ + "key": "value", + }, + wantLog: []LogEntry{ + { + Context: "y\": \"value}", + Text: "While parsing a string missing the left delimiter in object value context, we found a , or } and we couldn't determine that a right delimiter was present. 
Stopping here", + }, + { + Context: "y\": \"value}", + Text: "While parsing a string, we missed the closing quote, ignoring", + }, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + gotValue, gotLog, err := RepairJSONWithLog(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(gotValue, tc.wantValue) { + t.Fatalf("got %#v want %#v", gotValue, tc.wantValue) + } + if !reflect.DeepEqual(gotLog, tc.wantLog) { + t.Fatalf("got %#v want %#v", gotLog, tc.wantLog) + } + }) + } +} + +func TestRepairJSONStrict(t *testing.T) { + cases := []struct { + name string + input string + opts []Option + wantErr string + }{ + { + name: "multiple_top_level", + input: "{\"key\":\"value\"}[\"value\"]", + opts: []Option{WithStrict()}, + wantErr: "multiple top-level JSON elements", + }, + { + name: "duplicate_keys_in_array", + input: "[{\"key\": \"first\", \"key\": \"second\"}]", + opts: []Option{WithStrict(), WithSkipJSONLoads()}, + wantErr: "duplicate key found", + }, + { + name: "empty_key", + input: "{\"\" : \"value\"}", + opts: []Option{WithStrict(), WithSkipJSONLoads()}, + wantErr: "empty key found", + }, + { + name: "missing_colon", + input: "{\"missing\" \"colon\"}", + opts: []Option{WithStrict()}, + wantErr: "missing ':' after key", + }, + { + name: "empty_value", + input: "{\"key\": , \"key2\": \"value2\"}", + opts: []Option{WithStrict(), WithSkipJSONLoads()}, + wantErr: "parsed value is empty", + }, + { + name: "empty_object_with_extra", + input: "{\"dangling\"}", + opts: []Option{WithStrict()}, + wantErr: "parsed object is empty", + }, + { + name: "immediate_doubled_quotes", + input: "{\"key\": \"\"\"\"}", + opts: []Option{WithStrict()}, + wantErr: "doubled quotes followed by another quote", + }, + { + name: "doubled_quotes_followed_by_string", + input: "{\"key\": \"\" \"value\"}", + opts: []Option{WithStrict()}, + wantErr: "doubled quotes followed by another quote while parsing a string", + }, + } + + 
for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + _, err := RepairJSON(tc.input, tc.opts...) + if err == nil { + t.Fatalf("expected error") + } + if !strings.Contains(err.Error(), tc.wantErr) { + t.Fatalf("got %q want %q", err.Error(), tc.wantErr) + } + }) + } +} + +func TestParseArrayObjects(t *testing.T) { + cases := []struct { + name string + input string + want any + }{ + { + name: "empty_array", + input: "[]", + want: []any{}, + }, + { + name: "numbers_array", + input: "[1, 2, 3, 4]", + want: []any{ + json.Number("1"), + json.Number("2"), + json.Number("3"), + json.Number("4"), + }, + }, + { + name: "unfinished_array", + input: "[", + want: []any{}, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := Loads(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %#v want %#v", got, tc.want) + } + }) + } +} + +func TestParseArrayEdgeCases(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "nested_newlines", + input: "[[1\n\n]", + want: "[[1]]", + }, + { + name: "array_with_object_end", + input: "[{]", + want: "[]", + }, + { + name: "just_open_bracket", + input: "[", + want: "[]", + }, + { + name: "dangling_quote", + input: "[\"", + want: "[]", + }, + { + name: "just_close_bracket", + input: "]", + want: "", + }, + { + name: "trailing_comma", + input: "[1, 2, 3,", + want: "[1, 2, 3]", + }, + { + name: "ellipsis_end", + input: "[1, 2, 3, ...]", + want: "[1, 2, 3]", + }, + { + name: "ellipsis_middle", + input: "[1, 2, ... 
, 3]", + want: "[1, 2, 3]", + }, + { + name: "ellipsis_string", + input: "[1, 2, '...', 3]", + want: "[1, 2, \"...\", 3]", + }, + { + name: "ellipsis_bools", + input: "[true, false, null, ...]", + want: "[true, false, null]", + }, + { + name: "missing_commas", + input: "[\"a\" \"b\" \"c\" 1", + want: "[\"a\", \"b\", \"c\", 1]", + }, + { + name: "object_array_missing_end", + input: "{\"employees\":[\"John\", \"Anna\",", + want: "{\"employees\": [\"John\", \"Anna\"]}", + }, + { + name: "object_array_missing_quote", + input: "{\"employees\":[\"John\", \"Anna\", \"Peter", + want: "{\"employees\": [\"John\", \"Anna\", \"Peter\"]}", + }, + { + name: "nested_object_array", + input: "{\"key1\": {\"key2\": [1, 2, 3", + want: "{\"key1\": {\"key2\": [1, 2, 3]}}", + }, + { + name: "missing_array_quote", + input: "{\"key\": [\"value]}", + want: "{\"key\": [\"value\"]}", + }, + { + name: "embedded_quotes", + input: "[\"lorem \"ipsum\" sic\"]", + want: "[\"lorem \\\"ipsum\\\" sic\"]", + }, + { + name: "array_closes_object", + input: "{\"key1\": [\"value1\", \"value2\"}, \"key2\": [\"value3\", \"value4\"]}", + want: "{\"key1\": [\"value1\", \"value2\"], \"key2\": [\"value3\", \"value4\"]}", + }, + { + name: "rows_missing_bracket", + input: "{\"headers\": [\"A\", \"B\", \"C\"], \"rows\": [[\"r1a\", \"r1b\", \"r1c\"], [\"r2a\", \"r2b\", \"r2c\"], \"r3a\", \"r3b\", \"r3c\"], [\"r4a\", \"r4b\", \"r4c\"], [\"r5a\", \"r5b\", \"r5c\"]]}", + want: "{\"headers\": [\"A\", \"B\", \"C\"], \"rows\": [[\"r1a\", \"r1b\", \"r1c\"], [\"r2a\", \"r2b\", \"r2c\"], [\"r3a\", \"r3b\", \"r3c\"], [\"r4a\", \"r4b\", \"r4c\"], [\"r5a\", \"r5b\", \"r5c\"]]}", + }, + { + name: "array_missing_commas", + input: "{\"key\": [\"value\" \"value1\" \"value2\"]}", + want: "{\"key\": [\"value\", \"value1\", \"value2\"]}", + }, + { + name: "array_many_quotes", + input: "{\"key\": [\"lorem \"ipsum\" dolor \"sit\" amet, \"consectetur\" \", \"lorem \"ipsum\" dolor\", \"lorem\"]}", + want: "{\"key\": [\"lorem 
\\\"ipsum\\\" dolor \\\"sit\\\" amet, \\\"consectetur\\\" \", \"lorem \\\"ipsum\\\" dolor\", \"lorem\"]}", + }, + { + name: "quoted_key_characters", + input: "{\"k\"e\"y\": \"value\"}", + want: "{\"k\\\"e\\\"y\": \"value\"}", + }, + { + name: "array_object_mixed", + input: "[\"key\":\"value\"}]", + want: "[{\"key\": \"value\"}]", + }, + { + name: "array_object_followed_by_literal", + input: "[{\"key\": \"value\", \"key", + want: "[{\"key\": \"value\"}, [\"key\"]]", + }, + { + name: "set_like_array", + input: "{'key1', 'key2'}", + want: "[\"key1\", \"key2\"]", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestParseArrayMissingQuotes(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "value_missing_quote", + input: "[\"value1\" value2\", \"value3\"]", + want: "[\"value1\", \"value2\", \"value3\"]", + }, + { + name: "comment_token", + input: "{\"bad_one\":[\"Lorem Ipsum\", \"consectetur\" comment\" ], \"good_one\":[ \"elit\", \"sed\", \"tempor\"]}", + want: "{\"bad_one\": [\"Lorem Ipsum\", \"consectetur\", \"comment\"], \"good_one\": [\"elit\", \"sed\", \"tempor\"]}", + }, + { + name: "comment_token_no_space", + input: "{\"bad_one\": [\"Lorem Ipsum\",\"consectetur\" comment],\"good_one\": [\"elit\",\"sed\",\"tempor\"]}", + want: "{\"bad_one\": [\"Lorem Ipsum\", \"consectetur\", \"comment\"], \"good_one\": [\"elit\", \"sed\", \"tempor\"]}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestParseComment(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + 
name: "just_slash", + input: "/", + want: "", + }, + { + name: "block_comment_prefix", + input: "/* comment */ {\"key\": \"value\"}", + want: "{\"key\": \"value\"}", + }, + { + name: "line_comment", + input: "{ \"key\": { \"key2\": \"value2\" // comment }, \"key3\": \"value3\" }", + want: "{\"key\": {\"key2\": \"value2\"}, \"key3\": \"value3\"}", + }, + { + name: "hash_comment", + input: "{ \"key\": { \"key2\": \"value2\" # comment }, \"key3\": \"value3\" }", + want: "{\"key\": {\"key2\": \"value2\"}, \"key3\": \"value3\"}", + }, + { + name: "block_comment_inside", + input: "{ \"key\": { \"key2\": \"value2\" /* comment */ }, \"key3\": \"value3\" }", + want: "{\"key\": {\"key2\": \"value2\"}, \"key3\": \"value3\"}", + }, + { + name: "array_block_comment", + input: "[ \"value\", /* comment */ \"value2\" ]", + want: "[\"value\", \"value2\"]", + }, + { + name: "unterminated_comment", + input: "{ \"key\": \"value\" /* comment", + want: "{\"key\": \"value\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +func TestParseNumber(t *testing.T) { + cases := []struct { + name string + input string + want any + }{ + { + name: "integer", + input: "1", + want: json.Number("1"), + }, + { + name: "float", + input: "1.2", + want: json.Number("1.2"), + }, + { + name: "underscored_integer", + input: "{\"value\": 82_461_110}", + want: map[string]any{ + "value": json.Number("82461110"), + }, + }, + { + name: "underscored_float", + input: "{\"value\": 1_234.5_6}", + want: map[string]any{ + "value": json.Number("1234.56"), + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := Loads(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %#v want %#v", got, tc.want) + 
} + }) + } +} + +func TestParseNumberEdgeCases(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "leading_dash", + input: " - { \"test_key\": [\"test_value\", \"test_value2\"] }", + want: "{\"test_key\": [\"test_value\", \"test_value2\"]}", + }, + { + name: "fraction", + input: "{\"key\": 1/3}", + want: "{\"key\": \"1/3\"}", + }, + { + name: "leading_decimal", + input: "{\"key\": .25}", + want: "{\"key\": 0.25}", + }, + { + name: "fraction_in_object", + input: "{\"here\": \"now\", \"key\": 1/3, \"foo\": \"bar\"}", + want: "{\"here\": \"now\", \"key\": \"1/3\", \"foo\": \"bar\"}", + }, + { + name: "fraction_long", + input: "{\"key\": 12345/67890}", + want: "{\"key\": \"12345/67890\"}", + }, + { + name: "array_incomplete", + input: "[105,12", + want: "[105, 12]", + }, + { + name: "object_numbers", + input: "{\"key\", 105,12,", + want: "{\"key\": \"105,12\"}", + }, + { + name: "fraction_trailing", + input: "{\"key\": 1/3, \"foo\": \"bar\"}", + want: "{\"key\": \"1/3\", \"foo\": \"bar\"}", + }, + { + name: "dash_number", + input: "{\"key\": 10-20}", + want: "{\"key\": \"10-20\"}", + }, + { + name: "double_dot", + input: "{\"key\": 1.1.1}", + want: "{\"key\": \"1.1.1\"}", + }, + { + name: "dash_array", + input: "[- ", + want: "[]", + }, + { + name: "trailing_decimal", + input: "{\"key\": 1. 
}", + want: "{\"key\": 1.0}", + }, + { + name: "exponent", + input: "{\"key\": 1e10 }", + want: "{\"key\": 10000000000.0}", + }, + { + name: "bad_exponent", + input: "{\"key\": 1e }", + want: "{\"key\": 1}", + }, + { + name: "non_number_suffix", + input: "{\"key\": 1notanumber }", + want: "{\"key\": \"1notanumber\"}", + }, + { + name: "uuid_literal", + input: "{\"rowId\": 57eeeeb1-450b-482c-81b9-4be77e95dee2}", + want: "{\"rowId\": \"57eeeeb1-450b-482c-81b9-4be77e95dee2\"}", + }, + { + name: "array_non_number", + input: "[1, 2notanumber]", + want: "[1, \"2notanumber\"]", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestParseObjectObjects is a table-driven test: each input is passed to Loads and the decoded value is compared with reflect.DeepEqual against the expected map; numeric values are expected as json.Number. Any error from Loads fails the subtest. */ func TestParseObjectObjects(t *testing.T) { + cases := []struct { + name string + input string + want any + }{ + { + name: "empty_object", + input: "{}", + want: map[string]any{}, + }, + { + name: "object_values", + input: "{ \"key\": \"value\", \"key2\": 1, \"key3\": True }", + want: map[string]any{ + "key": "value", + "key2": json.Number("1"), + "key3": true, + }, + }, + { + name: "unfinished_object", + input: "{", + want: map[string]any{}, + }, + { + name: "object_with_literals", + input: "{ \"key\": value, \"key2\": 1 \"key3\": null }", + want: map[string]any{ + "key": "value", + "key2": json.Number("1"), + "key3": nil, + }, + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := Loads(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %#v want %#v", got, tc.want) + } + }) + } +} + +/* TestParseObjectEdgeCases feeds malformed object inputs (unclosed braces, unquoted or half-quoted keys and values, stray literals, trailing code fences, misplaced arrays) to RepairJSON and compares the returned string with the expected repaired JSON text. Any error from RepairJSON fails the subtest. */ func TestParseObjectEdgeCases(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "empty_trim", + input: " { } ", + want: "{}", + }, + { + name: "just_open", + input: "{", + want: "{}", + }, + { + name:
"just_close", + input: "}", + want: "", + }, + { + name: "dangling_quote", + input: "{\"", + want: "{}", + }, + { + name: "object_array_merge", + input: "{foo: [}", + want: "{\"foo\": []}", + }, + { + name: "empty_key", + input: "{\"\": \"value\"", + want: "{\"\": \"value\"}", + }, + { + name: "embedded_quotes", + input: "{\"key\": \"v\"alue\"}", + want: "{\"key\": \"v\\\"alue\\\"\"}", + }, + { + name: "comment_literal", + input: "{\"value_1\": true, COMMENT \"value_2\": \"data\"}", + want: "{\"value_1\": true, \"value_2\": \"data\"}", + }, + { + name: "comment_literal_trailing", + input: "{\"value_1\": true, SHOULD_NOT_EXIST \"value_2\": \"data\" AAAA }", + want: "{\"value_1\": true, \"value_2\": \"data\"}", + }, + { + name: "empty_key_bool", + input: "{\"\" : true, \"key2\": \"value2\"}", + want: "{\"\": true, \"key2\": \"value2\"}", + }, + { + name: "double_quotes", + input: "{\"\"answer\"\":[{\"\"traits\"\":''Female aged 60+'',\"\"answer1\"\":\"\"5\"\"}]}", + want: "{\"answer\": [{\"traits\": \"Female aged 60+\", \"answer1\": \"5\"}]}", + }, + { + name: "missing_quotes", + input: "{ \"words\": abcdef\", \"numbers\": 12345\", \"words2\": ghijkl\" }", + want: "{\"words\": \"abcdef\", \"numbers\": 12345, \"words2\": \"ghijkl\"}", + }, + { + name: "broken_split_key", + input: "{\"number\": 1,\"reason\": \"According...\"\"ans\": \"YES\"}", + want: "{\"number\": 1, \"reason\": \"According...\", \"ans\": \"YES\"}", + }, + { + name: "nested_braces_in_string", + input: "{ \"a\" : \"{ b\": {} }\" }", + want: "{\"a\": \"{ b\"}", + }, + { + name: "literal_after_string", + input: "{\"b\": \"xxxxx\" true}", + want: "{\"b\": \"xxxxx\"}", + }, + { + name: "string_with_quotes", + input: "{\"key\": \"Lorem \"ipsum\" s,\"}", + want: "{\"key\": \"Lorem \\\"ipsum\\\" s,\"}", + }, + { + name: "literal_list", + input: "{\"lorem\": ipsum, sic, datum.\",}", + want: "{\"lorem\": \"ipsum, sic, datum.\"}", + }, + { + name: "multiple_keys", + input: "{\"lorem\": sic tamet.
\"ipsum\": sic tamet, quick brown fox. \"sic\": ipsum}", + want: "{\"lorem\": \"sic tamet.\", \"ipsum\": \"sic tamet\", \"sic\": \"ipsum\"}", + }, + { + name: "unfinished_string", + input: "{\"lorem_ipsum\": \"sic tamet, quick brown fox. }", + want: "{\"lorem_ipsum\": \"sic tamet, quick brown fox.\"}", + }, + { + name: "missing_quotes_keys", + input: "{\"key\":value, \" key2\":\"value2\" }", + want: "{\"key\": \"value\", \" key2\": \"value2\"}", + }, + { + name: "missing_quotes_key_separator", + input: "{\"key\":value \"key2\":\"value2\" }", + want: "{\"key\": \"value\", \"key2\": \"value2\"}", + }, + { + name: "single_quotes_braces", + input: "{'text': 'words{words in brackets}more words'}", + want: "{\"text\": \"words{words in brackets}more words\"}", + }, + { + name: "literal_with_braces", + input: "{text:words{words in brackets}}", + want: "{\"text\": \"words{words in brackets}\"}", + }, + { + name: "literal_with_braces_suffix", + input: "{text:words{words in brackets}m}", + want: "{\"text\": \"words{words in brackets}m\"}", + }, + { + name: "trailing_markdown", + input: "{\"key\": \"value, value2\"```", + want: "{\"key\": \"value, value2\"}", + }, + { + name: "trailing_markdown_quote", + input: "{\"key\": \"value}```", + want: "{\"key\": \"value\"}", + }, + { + name: "bare_keys", + input: "{key:value,key2:value2}", + want: "{\"key\": \"value\", \"key2\": \"value2\"}", + }, + { + name: "missing_key_quote", + input: "{\"key:\"value\"}", + want: "{\"key\": \"value\"}", + }, + { + name: "missing_value_quote", + input: "{\"key:value}", + want: "{\"key\": \"value\"}", + }, + { + name: "array_double_quotes", + input: "[{\"lorem\": {\"ipsum\": \"sic\"}, \"\"\"\" \"lorem\": {\"ipsum\": \"sic\"}]", + want: "[{\"lorem\": {\"ipsum\": \"sic\"}}, {\"lorem\": {\"ipsum\": \"sic\"}}]", + }, + { + name: "arrays_in_object", + input: "{ \"key\": [\"arrayvalue\"], [\"arrayvalue1\"], [\"arrayvalue2\"], \"key3\": \"value3\" }", + want: "{\"key\": [\"arrayvalue\", \"arrayvalue1\",
\"arrayvalue2\"], \"key3\": \"value3\"}", + }, + { + name: "nested_arrays_in_object", + input: "{ \"key\": [[1, 2, 3], \"a\", \"b\"], [[4, 5, 6], [7, 8, 9]] }", + want: "{\"key\": [[1, 2, 3], \"a\", \"b\", [4, 5, 6], [7, 8, 9]]}", + }, + { + name: "array_key_missing_value", + input: "{ \"key\": [\"arrayvalue\"], \"key3\": \"value3\", [\"arrayvalue1\"] }", + want: "{\"key\": [\"arrayvalue\"], \"key3\": \"value3\", \"arrayvalue1\": \"\"}", + }, + { + name: "json_string_literal", + input: "{\"key\": \"{\\\"key\\\":[\\\"value\\\"],\\\"key2\\\":\"value2\"}\"}", + want: "{\"key\": \"{\\\"key\\\":[\\\"value\\\"],\\\"key2\\\":\\\"value2\\\"}\"}", + }, + { + name: "empty_value", + input: "{\"key\": , \"key2\": \"value2\"}", + want: "{\"key\": \"\", \"key2\": \"value2\"}", + }, + { + name: "array_missing_object_end", + input: "{\"array\":[{\"key\": \"value\"], \"key2\": \"value2\"}", + want: "{\"array\": [{\"key\": \"value\"}], \"key2\": \"value2\"}", + }, + { + name: "object_double_close", + input: "[{\"key\":\"value\"}},{\"key\":\"value\"}]", + want: "[{\"key\": \"value\"}, {\"key\": \"value\"}]", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestParseObjectMergeAtEnd covers inputs in which more content follows an object that has already been closed; it checks the exact string RepairJSON returns for each case. Any error from RepairJSON fails the subtest. */ func TestParseObjectMergeAtEnd(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "merge_key", + input: "{\"key\": \"value\"}, \"key2\": \"value2\"}", + want: "{\"key\": \"value\", \"key2\": \"value2\"}", + }, + { + name: "merge_empty_value", + input: "{\"key\": \"value\"}, \"key2\": }", + want: "{\"key\": \"value\", \"key2\": \"\"}", + }, + { + name: "merge_array_discard", + input: "{\"key\": \"value\"}, []", + want: "{\"key\": \"value\"}", + }, + { + name: "merge_array_keep", + input: "{\"key\": \"value\"}, [\"abc\"]", + want: "[{\"key\": \"value\"}, [\"abc\"]]", + }, + { +
name: "merge_object", + input: "{\"key\": \"value\"}, {}", + want: "{\"key\": \"value\"}", + }, + { + name: "merge_empty_key", + input: "{\"key\": \"value\"}, \"\" : \"value2\"}", + want: "{\"key\": \"value\", \"\": \"value2\"}", + }, + { + name: "merge_missing_colon", + input: "{\"key\": \"value\"}, \"key2\" \"value2\"}", + want: "{\"key\": \"value\", \"key2\": \"value2\"}", + }, + { + name: "merge_multiple_keys", + input: "{\"key1\": \"value1\"}, \"key2\": \"value2\", \"key3\": \"value3\"}", + want: "{\"key1\": \"value1\", \"key2\": \"value2\", \"key3\": \"value3\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestParseStringBasics checks RepairJSON on inputs with no JSON structure of their own (a lone quote, whitespace, a bare word), expecting an empty string, and on a bare word followed by an object, expecting only the object. */ func TestParseStringBasics(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "just_quote", + input: "\"", + want: "", + }, + { + name: "newline", + input: "\n", + want: "", + }, + { + name: "space", + input: " ", + want: "", + }, + { + name: "string_literal", + input: "string", + want: "", + }, + { + name: "string_before_object", + input: "stringbeforeobject {}", + want: "{}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestMissingAndMixedQuotes checks RepairJSON on inputs with single, curly, missing or unbalanced quotes around keys and values, including quotes embedded inside string values. Each result is compared as an exact string. */ func TestMissingAndMixedQuotes(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "mixed_quotes", + input: "{'key': 'string', 'key2': false, \"key3\": null, \"key4\": unquoted}", + want: "{\"key\": \"string\", \"key2\": false, \"key3\": null, \"key4\": \"unquoted\"}", + }, + { + name: "missing_last_quote", + input: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York", + want: "{\"name\": \"John\", \"age\": 30, \"city\":
\"New York\"}", + }, + { + name: "missing_quotes_key", + input: "{\"name\": \"John\", \"age\": 30, city: \"New York\"}", + want: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}", + }, + { + name: "missing_quotes_value", + input: "{\"name\": \"John\", \"age\": 30, \"city\": New York}", + want: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}", + }, + { + name: "missing_quotes_value_with_name", + input: "{\"name\": John, \"age\": 30, \"city\": \"New York\"}", + want: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\"}", + }, + { + name: "slanted_delimiter", + input: "{“slanted_delimiter”: \"value\"}", + want: "{\"slanted_delimiter\": \"value\"}", + }, + { + name: "shortened_string", + input: "{\"name\": \"John\", \"age\": 30, \"city\": \"New", + want: "{\"name\": \"John\", \"age\": 30, \"city\": \"New\"}", + }, + { + name: "missing_quote_in_middle", + input: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York, \"gender\": \"male\"}", + want: "{\"name\": \"John\", \"age\": 30, \"city\": \"New York\", \"gender\": \"male\"}", + }, + { + name: "comment_literal_in_array", + input: "[{\"key\": \"value\", COMMENT \"notes\": \"lorem \"ipsum\", sic.\" }]", + want: "[{\"key\": \"value\", \"notes\": \"lorem \\\"ipsum\\\", sic.\"}]", + }, + { + name: "double_quote_prefix", + input: "{\"key\": \"\"value\"}", + want: "{\"key\": \"value\"}", + }, + { + name: "numeric_key", + input: "{\"key\": \"value\", 5: \"value\"}", + want: "{\"key\": \"value\", \"5\": \"value\"}", + }, + { + name: "escaped_quotes", + input: "{\"foo\": \"\\\\\"bar\\\\\"\"", + want: "{\"foo\": \"\\\"bar\\\"\"}", + }, + { + name: "empty_key_prefix", + input: "{\"\" key\":\"val\"", + want: "{\" key\": \"val\"}", + }, + { + name: "missing_comma", + input: "{\"key\": value \"key2\" : \"value2\" ", + want: "{\"key\": \"value\", \"key2\": \"value2\"}", + }, + { + name: "ellipsis_quotes", + input: "{\"key\": \"lorem ipsum ... \"sic \" tamet. ...}", + want: "{\"key\": \"lorem ipsum ...
\\\"sic \\\" tamet. ...\"}", + }, + { + name: "trailing_comma", + input: "{\"key\": value , }", + want: "{\"key\": \"value\"}", + }, + { + name: "comment_in_string", + input: "{\"comment\": \"lorem, \"ipsum\" sic \"tamet\". To improve\"}", + want: "{\"comment\": \"lorem, \\\"ipsum\\\" sic \\\"tamet\\\". To improve\"}", + }, + { + name: "value_with_embedded_quotes", + input: "{\"key\": \"v\"alu\"e\"} key:", + want: "{\"key\": \"v\\\"alu\\\"e\"}", + }, + { + name: "value_with_embedded_quote", + input: "{\"key\": \"v\"alue\", \"key2\": \"value2\"}", + want: "{\"key\": \"v\\\"alue\", \"key2\": \"value2\"}", + }, + { + name: "array_value_with_quote", + input: "[{\"key\": \"v\"alu,e\", \"key2\": \"value2\"}]", + want: "[{\"key\": \"v\\\"alu,e\", \"key2\": \"value2\"}]", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestEscaping checks how RepairJSON handles escape sequences and control characters inside keys and values. WithSkipJSONLoads is passed only for the case named unicode_escape_skip_loads; every other case runs with no options. NOTE(review): in the unicode_escape case the \u sequences sit in an interpreted Go string literal, so the compiler decodes them and RepairJSON receives the plain text 'value'; confirm whether a literal backslash-u input was intended. */ func TestEscaping(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "just_quotes", + input: "'\"'", + want: "", + }, + { + name: "escaped_chars", + input: "{\"key\": 'string\"\n\t\\\\le'", + want: "{\"key\": \"string\\\"\\n\\t\\\\le\"}", + }, + { + name: "html_escape", + input: "{\"real_content\": \"Some string: Some other string \\t Some string Some link\"", + want: "{\"real_content\": \"Some string: Some other string \\t Some string Some link\"}", + }, + { + name: "newline_in_key", + input: "{\"key_1\n\": \"value\"}", + want: "{\"key_1\": \"value\"}", + }, + { + name: "tab_in_key", + input: "{\"key\t_\": \"value\"}", + want: "{\"key\\t_\": \"value\"}", + }, + { + name: "unicode_escape", + input: "{\"key\": '\u0076\u0061\u006c\u0075\u0065'}", + want: "{\"key\": \"value\"}", + }, + { + name: "unicode_escape_skip_loads", + input: "{\"key\": \"\\u0076\\u0061\\u006C\\u0075\\u0065\"}", + want: "{\"key\":
\"value\"}", + }, + { + name: "escaped_single_quote", + input: "{\"key\": \"valu\\'e\"}", + want: "{\"key\": \"valu'e\"}", + }, + { + name: "escaped_object", + input: "{'key': \"{\\\"key\\\": 1, \\\"key2\\\": 1}\"}", + want: "{\"key\": \"{\\\"key\\\": 1, \\\"key2\\\": 1}\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + opts := []Option{} + if tc.name == "unicode_escape_skip_loads" { + opts = append(opts, WithSkipJSONLoads()) + } + got, err := RepairJSON(tc.input, opts...) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestMarkdown checks that markdown link syntax inside a string value, including unescaped inner double quotes, is kept as string content in the text RepairJSON returns. */ func TestMarkdown(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "markdown_link", + input: "{ \"content\": \"[LINK](\"https://google.com\")\" }", + want: "{\"content\": \"[LINK](\\\"https://google.com\\\")\"}", + }, + { + name: "markdown_incomplete", + input: "{ \"content\": \"[LINK](\" }", + want: "{\"content\": \"[LINK](\"}", + }, + { + name: "markdown_in_object", + input: "{ \"content\": \"[LINK](\", \"key\": true }", + want: "{\"content\": \"[LINK](\", \"key\": true}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestLeadingTrailingCharacters checks that RepairJSON returns only the JSON object when the input is surrounded by backtick fences or explanatory text. */ func TestLeadingTrailingCharacters(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "wrapped_markdown", + input: "````{ \"key\": \"value\" }```", + want: "{\"key\": \"value\"}", + }, + { + name: "trailing_markdown_block", + input: "{ \"a\": \"\", \"b\": [ { \"c\": 1} ] \n}```", + want: "{\"a\": \"\", \"b\": [{\"c\": 1}]}", + }, + { + name: "preface_text", + input: "Based on the information extracted, here is the filled JSON output: ```json { 'a': 'b' } ```", + want: "{\"a\": \"b\"}", + }, + { +
name: "multiline_markdown", + input: "\n The next 64 elements are:\n ```json\n { \"key\": \"value\" }\n ```", + want: "{\"key\": \"value\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestStringJSONLLMBlock checks RepairJSON on string values that contain backticks or a json code-fence marker; in the json_block_inside_string case the expected output replaces the fenced string with the object it contains. */ func TestStringJSONLLMBlock(t *testing.T) { + cases := []struct { + name string + input string + want string + }{ + { + name: "backticks", + input: "{\"key\": \"``\"", + want: "{\"key\": \"``\"}", + }, + { + name: "backticks_json", + input: "{\"key\": \"```json\"", + want: "{\"key\": \"```json\"}", + }, + { + name: "json_block_inside_string", + input: "{\"key\": \"```json {\"key\": [{\"key1\": 1},{\"key2\": 2}]}```\"}", + want: "{\"key\": {\"key\": [{\"key1\": 1}, {\"key2\": 2}]}}", + }, + { + name: "response_prefix", + input: "{\"response\": \"```json{}\"", + want: "{\"response\": \"```json{}\"}", + }, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +} + +/* TestParseBooleanOrNull runs two tables: loadCases calls Loads on bare literals (capitalized forms are expected to yield an empty string, lower-case forms the Go values true, false and nil, compared with reflect.DeepEqual), and stringCases calls RepairJSON on objects whose literal values use mixed capitalization and compares the returned string. */ func TestParseBooleanOrNull(t *testing.T) { + loadCases := []struct { + name string + input string + want any + }{ + { + name: "upper_true", + input: "True", + want: "", + }, + { + name: "upper_false", + input: "False", + want: "", + }, + { + name: "upper_null", + input: "Null", + want: "", + }, + { + name: "lower_true", + input: "true", + want: true, + }, + { + name: "lower_false", + input: "false", + want: false, + }, + { + name: "lower_null", + input: "null", + want: nil, + }, + } + + for _, tc := range loadCases { + t.Run(tc.name, func(t *testing.T) { + got, err := Loads(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if !reflect.DeepEqual(got, tc.want) { + t.Fatalf("got %#v want %#v", got,
tc.want) + } + }) + } + + stringCases := []struct { + name string + input string + want string + }{ + { + name: "bools_in_object", + input: " {\"key\": true, \"key2\": false, \"key3\": null}", + want: "{\"key\": true, \"key2\": false, \"key3\": null}", + }, + { + name: "uppercase_bools", + input: "{\"key\": TRUE, \"key2\": FALSE, \"key3\": Null} ", + want: "{\"key\": true, \"key2\": false, \"key3\": null}", + }, + } + + for _, tc := range stringCases { + t.Run(tc.name, func(t *testing.T) { + got, err := RepairJSON(tc.input) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if got != tc.want { + t.Fatalf("got %q want %q", got, tc.want) + } + }) + } +}