From fae0f2e82da57a0e0335d86b417a819121f4e69f Mon Sep 17 00:00:00 2001 From: Christian Rocha Date: Wed, 4 Mar 2026 15:09:01 -0700 Subject: [PATCH] fix(lsp/edit): properly handle non-ascii chars (e.g. CJK) (#2325) --- go.mod | 2 +- go.sum | 4 +- internal/lsp/client.go | 2 +- internal/lsp/handlers.go | 25 +-- internal/lsp/util/edit.go | 61 +++++-- internal/lsp/util/edit_test.go | 312 ++++++++++++++++++++++++++++++++- 6 files changed, 377 insertions(+), 29 deletions(-) diff --git a/go.mod b/go.mod index b708ffa91080bdfc7e13658a8069f1e706e8b477..26fcbaf6e17fe9032e8c0f063eb69a360ca70734 100644 --- a/go.mod +++ b/go.mod @@ -31,7 +31,7 @@ require ( github.com/charmbracelet/x/exp/ordered v0.1.0 github.com/charmbracelet/x/exp/slice v0.0.0-20260209194814-eeb2896ac759 github.com/charmbracelet/x/exp/strings v0.1.0 - github.com/charmbracelet/x/powernap v0.1.2 + github.com/charmbracelet/x/powernap v0.1.3 github.com/charmbracelet/x/term v0.2.2 github.com/clipperhouse/displaywidth v0.11.0 github.com/clipperhouse/uax29/v2 v2.7.0 diff --git a/go.sum b/go.sum index 9f58e3288eb0c39fd3a80e09601c85a2be9ec643..d759ae75ee70b1266f6a270d6e4784040afe348d 100644 --- a/go.sum +++ b/go.sum @@ -120,8 +120,8 @@ github.com/charmbracelet/x/exp/strings v0.1.0 h1:i69S2XI7uG1u4NLGeJPSYU++Nmjvpo9 github.com/charmbracelet/x/exp/strings v0.1.0/go.mod h1:/ehtMPNh9K4odGFkqYJKpIYyePhdp1hLBRvyY4bWkH8= github.com/charmbracelet/x/json v0.2.0 h1:DqB+ZGx2h+Z+1s98HOuOyli+i97wsFQIxP2ZQANTPrQ= github.com/charmbracelet/x/json v0.2.0/go.mod h1:opFIflx2YgXgi49xVUu8gEQ21teFAxyMwvOiZhIvWNM= -github.com/charmbracelet/x/powernap v0.1.2 h1:5i0zWrqE5c9PY+H90DXg/5+jHNZ5FgXOyOU4dzOSIxs= -github.com/charmbracelet/x/powernap v0.1.2/go.mod h1:cmdl5zlP5mR8TF2Y68UKc7hdGUDiSJ2+4hk0h04Hsx4= +github.com/charmbracelet/x/powernap v0.1.3 h1:rmxdQelSPB1QAgLRNMLOrgCTq3q2RXoLOJ2ZTwKG17g= +github.com/charmbracelet/x/powernap v0.1.3/go.mod h1:cmdl5zlP5mR8TF2Y68UKc7hdGUDiSJ2+4hk0h04Hsx4= github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk= github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI= github.com/charmbracelet/x/termios v0.1.1 h1:o3Q2bT8eqzGnGPOYheoYS8eEleT5ZVNYNy8JawjaNZY= diff --git a/internal/lsp/client.go b/internal/lsp/client.go index 973433801d586f2dc93d6156270308bda2e3d31f..2aa1b49a781489598f865cfd067d1adf5d68b7aa 100644 --- a/internal/lsp/client.go +++ b/internal/lsp/client.go @@ -194,7 +194,7 @@ func (c *Client) createPowernapClient() error { // registerHandlers registers the standard LSP notification and request handlers. func (c *Client) registerHandlers() { - c.RegisterServerRequestHandler("workspace/applyEdit", HandleApplyEdit) + c.RegisterServerRequestHandler("workspace/applyEdit", HandleApplyEdit(c.client.GetOffsetEncoding())) c.RegisterServerRequestHandler("workspace/configuration", HandleWorkspaceConfiguration) c.RegisterServerRequestHandler("client/registerCapability", HandleRegisterCapability) c.RegisterNotificationHandler("window/showMessage", func(ctx context.Context, method string, params json.RawMessage) { diff --git a/internal/lsp/handlers.go b/internal/lsp/handlers.go index 63ad93b2dd1cbc856eda4a9e41884dfc27e93870..5c791e7ac61a821628db3508bf072e569b9e7aaa 100644 --- a/internal/lsp/handlers.go +++ b/internal/lsp/handlers.go @@ -6,6 +6,7 @@ import ( "log/slog" "github.com/charmbracelet/crush/internal/lsp/util" + powernap "github.com/charmbracelet/x/powernap/pkg/lsp" "github.com/charmbracelet/x/powernap/pkg/lsp/protocol" ) @@ -44,19 +45,21 @@ func HandleRegisterCapability(_ context.Context, _ string, params json.RawMessag } // HandleApplyEdit handles workspace edit requests -func HandleApplyEdit(_ context.Context, _ string, params json.RawMessage) (any, error) { - var edit protocol.ApplyWorkspaceEditParams - if err := json.Unmarshal(params, &edit); err != nil { - return nil, err - } +func HandleApplyEdit(encoding powernap.OffsetEncoding) func(_ context.Context, _ string, params json.RawMessage) (any, error) { + return func(_ context.Context, _ string, params json.RawMessage) (any, error) { + var edit protocol.ApplyWorkspaceEditParams + if err := json.Unmarshal(params, &edit); err != nil { + return nil, err + } - err := util.ApplyWorkspaceEdit(edit.Edit) - if err != nil { - slog.Error("Error applying workspace edit", "error", err) - return protocol.ApplyWorkspaceEditResult{Applied: false, FailureReason: err.Error()}, nil - } + err := util.ApplyWorkspaceEdit(edit.Edit, encoding) + if err != nil { + slog.Error("Error applying workspace edit", "error", err) + return protocol.ApplyWorkspaceEditResult{Applied: false, FailureReason: err.Error()}, nil + } - return protocol.ApplyWorkspaceEditResult{Applied: true}, nil + return protocol.ApplyWorkspaceEditResult{Applied: true}, nil + } } // FileWatchRegistrationHandler is a function that will be called when file watch registrations are received diff --git a/internal/lsp/util/edit.go b/internal/lsp/util/edit.go index 23e9d479f2223773bbd0db41cc049bf8a02f3357..a2b9eef9e917552d48dcf43b14d68554d2209b6f 100644 --- a/internal/lsp/util/edit.go +++ b/internal/lsp/util/edit.go @@ -7,10 +7,11 @@ import ( "sort" "strings" + powernap "github.com/charmbracelet/x/powernap/pkg/lsp" "github.com/charmbracelet/x/powernap/pkg/lsp/protocol" ) -func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error { +func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit, encoding powernap.OffsetEncoding) error { path, err := uri.Path() if err != nil { return fmt.Errorf("invalid URI: %w", err) @@ -57,7 +58,7 @@ func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error { // Apply each edit for _, edit := range sortedEdits { - newLines, err := applyTextEdit(lines, edit) + newLines, err := applyTextEdit(lines, edit, encoding) if err != nil { return fmt.Errorf("failed to apply edit: %w", err) } @@ -85,13 +86,11 @@ func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error { return nil } -func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) { +func applyTextEdit(lines []string, edit protocol.TextEdit, encoding powernap.OffsetEncoding) ([]string, error) { startLine := int(edit.Range.Start.Line) endLine := int(edit.Range.End.Line) - startChar := int(edit.Range.Start.Character) - endChar := int(edit.Range.End.Character) - // Validate positions + // Validate positions before accessing lines. if startLine < 0 || startLine >= len(lines) { return nil, fmt.Errorf("invalid start line: %d", startLine) } @@ -99,6 +98,26 @@ func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) { endLine = len(lines) - 1 } + var startChar, endChar int + switch encoding { + case powernap.UTF8: + // UTF-8: Character offset is already a byte offset + startChar = int(edit.Range.Start.Character) + endChar = int(edit.Range.End.Character) + case powernap.UTF16: + // UTF-16 (default): Convert to byte offset + startLineContent := lines[startLine] + endLineContent := lines[endLine] + startChar = powernap.PositionToByteOffset(startLineContent, edit.Range.Start.Character) + endChar = powernap.PositionToByteOffset(endLineContent, edit.Range.End.Character) + default: + // UTF-32: Character offset is codepoint count, convert to byte offset + startLineContent := lines[startLine] + endLineContent := lines[endLine] + startChar = utf32ToByteOffset(startLineContent, edit.Range.Start.Character) + endChar = utf32ToByteOffset(endLineContent, edit.Range.End.Character) + } + // Create result slice with initial capacity result := make([]string, 0, len(lines)) @@ -149,7 +168,7 @@ func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) { } // applyDocumentChange applies a DocumentChange (create/rename/delete operations) -func applyDocumentChange(change protocol.DocumentChange) error { +func applyDocumentChange(change protocol.DocumentChange, encoding powernap.OffsetEncoding) error { if change.CreateFile != nil { path, err := change.CreateFile.URI.Path() if err != nil { @@ -222,24 +241,42 @@ func applyDocumentChange(change protocol.DocumentChange) error { return fmt.Errorf("invalid edit type: %w", err) } } - return applyTextEdits(change.TextDocumentEdit.TextDocument.URI, textEdits) + return applyTextEdits(change.TextDocumentEdit.TextDocument.URI, textEdits, encoding) } return nil } -// ApplyWorkspaceEdit applies the given WorkspaceEdit to the filesystem -func ApplyWorkspaceEdit(edit protocol.WorkspaceEdit) error { +// utf32ToByteOffset converts a UTF-32 codepoint offset to a byte offset. +func utf32ToByteOffset(lineText string, codepointOffset uint32) int { + if codepointOffset == 0 { + return 0 + } + + var codepointCount uint32 + for byteOffset := range lineText { + if codepointCount >= codepointOffset { + return byteOffset + } + codepointCount++ + } + return len(lineText) +} + +// ApplyWorkspaceEdit applies the given WorkspaceEdit to the filesystem. +// The encoding parameter specifies the position encoding used by the LSP server +// (UTF8, UTF16, or UTF32). This affects how character offsets are interpreted. +func ApplyWorkspaceEdit(edit protocol.WorkspaceEdit, encoding powernap.OffsetEncoding) error { // Handle Changes field for uri, textEdits := range edit.Changes { - if err := applyTextEdits(uri, textEdits); err != nil { + if err := applyTextEdits(uri, textEdits, encoding); err != nil { return fmt.Errorf("failed to apply text edits: %w", err) } } // Handle DocumentChanges field for _, change := range edit.DocumentChanges { - if err := applyDocumentChange(change); err != nil { + if err := applyDocumentChange(change, encoding); err != nil { return fmt.Errorf("failed to apply document change: %w", err) } } diff --git a/internal/lsp/util/edit_test.go b/internal/lsp/util/edit_test.go index cb32d3498289dbf4cdb69dfc9ca4ee73f10b7a20..8e7d12fc9cffedffb8b701f0a8c8040f162327fa 100644 --- a/internal/lsp/util/edit_test.go +++ b/internal/lsp/util/edit_test.go @@ -3,11 +3,319 @@ package util import ( "testing" - "github.com/stretchr/testify/require" - + powernap "github.com/charmbracelet/x/powernap/pkg/lsp" "github.com/charmbracelet/x/powernap/pkg/lsp/protocol" + "github.com/stretchr/testify/require" ) +func TestPositionToByteOffset(t *testing.T) { + tests := []struct { + name string + lineText string + utf16Char uint32 + expected int + }{ + { + name: "ASCII only", + lineText: "hello world", + utf16Char: 6, + expected: 6, + }, + { + name: "CJK characters (3 bytes each in UTF-8, 1 UTF-16 unit)", + lineText: "你好world", + utf16Char: 2, + expected: 6, + }, + { + name: "CJK - position after CJK", + lineText: "var x = \"你好world\"", + utf16Char: 11, + expected: 15, + }, + { + name: "Emoji (4 bytes in UTF-8, 2 UTF-16 units)", + lineText: "👋hello", + utf16Char: 2, + expected: 4, + }, + { + name: "Multiple emoji", + lineText: "👋👋world", + utf16Char: 4, + expected: 8, + }, + { + name: "Mixed content", + lineText: "Hello👋你好", + utf16Char: 8, + expected: 12, + }, + { + name: "Position 0", + lineText: "hello", + utf16Char: 0, + expected: 0, + }, + { + name: "Position beyond end", + lineText: "hi", + utf16Char: 100, + expected: 2, + }, + { + name: "Empty string", + lineText: "", + utf16Char: 0, + expected: 0, + }, + { + name: "Surrogate pair at start", + lineText: "𐐷hello", + utf16Char: 2, + expected: 4, + }, + { + name: "ZWJ family emoji (1 grapheme, 7 runes, 11 UTF-16 units)", + lineText: "hello👨\u200d👩\u200d👧\u200d👦world", + utf16Char: 16, + expected: 30, + }, + { + name: "ZWJ family emoji - offset into middle of grapheme cluster", + lineText: "hello👨\u200d👩\u200d👧\u200d👦world", + utf16Char: 8, + expected: 12, + }, + { + name: "Flag emoji (1 grapheme, 2 runes, 4 UTF-16 units)", + lineText: "hello🇺🇸world", + utf16Char: 9, + expected: 13, + }, + { + name: "Combining character (1 grapheme, 2 runes, 2 UTF-16 units)", + lineText: "caf\u0065\u0301!", + utf16Char: 5, + expected: 6, + }, + { + name: "Skin tone modifier (1 grapheme, 2 runes, 4 UTF-16 units)", + lineText: "hi👋🏽bye", + utf16Char: 6, + expected: 10, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := powernap.PositionToByteOffset(tt.lineText, tt.utf16Char) + if result != tt.expected { + t.Errorf("PositionToByteOffset(%q, %d) = %d, want %d", + tt.lineText, tt.utf16Char, result, tt.expected) + } + }) + } +} + +func TestApplyTextEdit_UTF16(t *testing.T) { + // Test that UTF-16 offsets are correctly converted to byte offsets + tests := []struct { + name string + lines []string + edit protocol.TextEdit + expected []string + }{ + { + name: "ASCII only - no conversion needed", + lines: []string{"hello world"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + Start: protocol.Position{Line: 0, Character: 6}, + End: protocol.Position{Line: 0, Character: 11}, + }, + NewText: "universe", + }, + expected: []string{"hello universe"}, + }, + { + name: "CJK characters - edit after Chinese characters", + lines: []string{`var x = "你好world"`}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // "你好" = 2 UTF-16 units, but 6 bytes in UTF-8 + // Position 11 is where "world" starts in UTF-16 + Start: protocol.Position{Line: 0, Character: 11}, + End: protocol.Position{Line: 0, Character: 16}, + }, + NewText: "universe", + }, + expected: []string{`var x = "你好universe"`}, + }, + { + name: "Emoji - edit after emoji (2 UTF-16 units)", + lines: []string{`fmt.Println("👋hello")`}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // 👋 = 2 UTF-16 units, 4 bytes in UTF-8 + // Position 15 is where "hello" starts in UTF-16 + Start: protocol.Position{Line: 0, Character: 15}, + End: protocol.Position{Line: 0, Character: 20}, + }, + NewText: "world", + }, + expected: []string{`fmt.Println("👋world")`}, + }, + { + name: "ZWJ family emoji - edit after grapheme cluster", + // "hello👨‍👩‍👧‍👦world" — family is 1 grapheme but 11 UTF-16 units + lines: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466world"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // "hello" = 5 UTF-16 units, family = 11 UTF-16 units + // "world" starts at UTF-16 offset 16 + Start: protocol.Position{Line: 0, Character: 16}, + End: protocol.Position{Line: 0, Character: 21}, + }, + NewText: "earth", + }, + expected: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466earth"}, + }, + { + name: "ZWJ family emoji - edit splits grapheme cluster in half", + // LSP servers can position into the middle of a grapheme cluster. + // After "hello" (5 UTF-16 units), the ZWJ family emoji starts. + // UTF-16 offset 7 lands between 👨 (2 units) and ZWJ, inside + // the grapheme cluster. The byte offset for position 7 is 9 + // (5 bytes for "hello" + 4 bytes for 👨). + lines: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466world"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + Start: protocol.Position{Line: 0, Character: 7}, + End: protocol.Position{Line: 0, Character: 16}, + }, + NewText: "", + }, + // Keeps "hello" + 👨 (first rune of cluster) then removes + // the rest of the cluster, leaving "hello👨world". + expected: []string{"hello\U0001F468world"}, + }, + { + name: "Flag emoji - edit after flag", + // 🇺🇸 = 2 regional indicator runes, 4 UTF-16 units, 8 bytes + lines: []string{"hello🇺🇸world"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + Start: protocol.Position{Line: 0, Character: 9}, + End: protocol.Position{Line: 0, Character: 14}, + }, + NewText: "earth", + }, + expected: []string{"hello🇺🇸earth"}, + }, + { + name: "Combining accent - edit after composed character", + // "café!" where é = e + U+0301 (2 code points, 2 UTF-16 units) + lines: []string{"caf\u0065\u0301!"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // "caf" = 3, "e" = 1, U+0301 = 1, total = 5 UTF-16 units + Start: protocol.Position{Line: 0, Character: 5}, + End: protocol.Position{Line: 0, Character: 6}, + }, + NewText: "?", + }, + expected: []string{"caf\u0065\u0301?"}, + }, + { + name: "Skin tone modifier - edit after modified emoji", + // 👋🏽 = U+1F44B U+1F3FD = 2 runes, 4 UTF-16 units, 8 bytes + lines: []string{"hi👋🏽bye"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // "hi" = 2, 👋🏽 = 4, total = 6 UTF-16 units + Start: protocol.Position{Line: 0, Character: 6}, + End: protocol.Position{Line: 0, Character: 9}, + }, + NewText: "later", + }, + expected: []string{"hi👋🏽later"}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := applyTextEdit(tt.lines, tt.edit, powernap.UTF16) + if err != nil { + t.Fatalf("applyTextEdit failed: %v", err) + } + if len(result) != len(tt.expected) { + t.Errorf("expected %d lines, got %d: %v", len(tt.expected), len(result), result) + return + } + for i := range result { + if result[i] != tt.expected[i] { + t.Errorf("line %d: expected %q, got %q", i, tt.expected[i], result[i]) + } + } + }) + } +} + +func TestApplyTextEdit_UTF8(t *testing.T) { + // Test that UTF-8 offsets are used directly without conversion + tests := []struct { + name string + lines []string + edit protocol.TextEdit + expected []string + }{ + { + name: "ASCII only - direct byte offset", + lines: []string{"hello world"}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + Start: protocol.Position{Line: 0, Character: 6}, + End: protocol.Position{Line: 0, Character: 11}, + }, + NewText: "universe", + }, + expected: []string{"hello universe"}, + }, + { + name: "CJK characters - byte offset used directly", + lines: []string{`var x = "你好world"`}, + edit: protocol.TextEdit{ + Range: protocol.Range{ + // With UTF-8 encoding, position 15 is the byte offset + Start: protocol.Position{Line: 0, Character: 15}, + End: protocol.Position{Line: 0, Character: 20}, + }, + NewText: "universe", + }, + expected: []string{`var x = "你好universe"`}, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := applyTextEdit(tt.lines, tt.edit, powernap.UTF8) + if err != nil { + t.Fatalf("applyTextEdit failed: %v", err) + } + if len(result) != len(tt.expected) { + t.Errorf("expected %d lines, got %d: %v", len(tt.expected), len(result), result) + return + } + for i := range result { + if result[i] != tt.expected[i] { + t.Errorf("line %d: expected %q, got %q", i, tt.expected[i], result[i]) + } + } + }) + } +} + func TestRangesOverlap(t *testing.T) { t.Parallel()