fix(lsp/edit): properly handle non-ascii chars (e.g. CJK) (#2325)

Christian Rocha created 1 month ago

Change summary

go.mod                         |   2 
go.sum                         |   4 
internal/lsp/client.go         |   2 
internal/lsp/handlers.go       |  25 +-
internal/lsp/util/edit.go      |  61 +++++-
internal/lsp/util/edit_test.go | 312 +++++++++++++++++++++++++++++++++++
6 files changed, 377 insertions(+), 29 deletions(-)

Detailed changes

go.mod 🔗

@@ -31,7 +31,7 @@ require (
 	github.com/charmbracelet/x/exp/ordered v0.1.0
 	github.com/charmbracelet/x/exp/slice v0.0.0-20260209194814-eeb2896ac759
 	github.com/charmbracelet/x/exp/strings v0.1.0
-	github.com/charmbracelet/x/powernap v0.1.2
+	github.com/charmbracelet/x/powernap v0.1.3
 	github.com/charmbracelet/x/term v0.2.2
 	github.com/clipperhouse/displaywidth v0.11.0
 	github.com/clipperhouse/uax29/v2 v2.7.0

go.sum 🔗

@@ -120,8 +120,8 @@ github.com/charmbracelet/x/exp/strings v0.1.0 h1:i69S2XI7uG1u4NLGeJPSYU++Nmjvpo9
 github.com/charmbracelet/x/exp/strings v0.1.0/go.mod h1:/ehtMPNh9K4odGFkqYJKpIYyePhdp1hLBRvyY4bWkH8=
 github.com/charmbracelet/x/json v0.2.0 h1:DqB+ZGx2h+Z+1s98HOuOyli+i97wsFQIxP2ZQANTPrQ=
 github.com/charmbracelet/x/json v0.2.0/go.mod h1:opFIflx2YgXgi49xVUu8gEQ21teFAxyMwvOiZhIvWNM=
-github.com/charmbracelet/x/powernap v0.1.2 h1:5i0zWrqE5c9PY+H90DXg/5+jHNZ5FgXOyOU4dzOSIxs=
-github.com/charmbracelet/x/powernap v0.1.2/go.mod h1:cmdl5zlP5mR8TF2Y68UKc7hdGUDiSJ2+4hk0h04Hsx4=
+github.com/charmbracelet/x/powernap v0.1.3 h1:rmxdQelSPB1QAgLRNMLOrgCTq3q2RXoLOJ2ZTwKG17g=
+github.com/charmbracelet/x/powernap v0.1.3/go.mod h1:cmdl5zlP5mR8TF2Y68UKc7hdGUDiSJ2+4hk0h04Hsx4=
 github.com/charmbracelet/x/term v0.2.2 h1:xVRT/S2ZcKdhhOuSP4t5cLi5o+JxklsoEObBSgfgZRk=
 github.com/charmbracelet/x/term v0.2.2/go.mod h1:kF8CY5RddLWrsgVwpw4kAa6TESp6EB5y3uxGLeCqzAI=
 github.com/charmbracelet/x/termios v0.1.1 h1:o3Q2bT8eqzGnGPOYheoYS8eEleT5ZVNYNy8JawjaNZY=

internal/lsp/client.go 🔗

@@ -194,7 +194,7 @@ func (c *Client) createPowernapClient() error {
 
 // registerHandlers registers the standard LSP notification and request handlers.
 func (c *Client) registerHandlers() {
-	c.RegisterServerRequestHandler("workspace/applyEdit", HandleApplyEdit)
+	c.RegisterServerRequestHandler("workspace/applyEdit", HandleApplyEdit(c.client.GetOffsetEncoding()))
 	c.RegisterServerRequestHandler("workspace/configuration", HandleWorkspaceConfiguration)
 	c.RegisterServerRequestHandler("client/registerCapability", HandleRegisterCapability)
 	c.RegisterNotificationHandler("window/showMessage", func(ctx context.Context, method string, params json.RawMessage) {

internal/lsp/handlers.go 🔗

@@ -6,6 +6,7 @@ import (
 	"log/slog"
 
 	"github.com/charmbracelet/crush/internal/lsp/util"
+	powernap "github.com/charmbracelet/x/powernap/pkg/lsp"
 	"github.com/charmbracelet/x/powernap/pkg/lsp/protocol"
 )
 
@@ -44,19 +45,21 @@ func HandleRegisterCapability(_ context.Context, _ string, params json.RawMessag
 }
 
 // HandleApplyEdit handles workspace edit requests
-func HandleApplyEdit(_ context.Context, _ string, params json.RawMessage) (any, error) {
-	var edit protocol.ApplyWorkspaceEditParams
-	if err := json.Unmarshal(params, &edit); err != nil {
-		return nil, err
-	}
+func HandleApplyEdit(encoding powernap.OffsetEncoding) func(_ context.Context, _ string, params json.RawMessage) (any, error) {
+	return func(_ context.Context, _ string, params json.RawMessage) (any, error) {
+		var edit protocol.ApplyWorkspaceEditParams
+		if err := json.Unmarshal(params, &edit); err != nil {
+			return nil, err
+		}
 
-	err := util.ApplyWorkspaceEdit(edit.Edit)
-	if err != nil {
-		slog.Error("Error applying workspace edit", "error", err)
-		return protocol.ApplyWorkspaceEditResult{Applied: false, FailureReason: err.Error()}, nil
-	}
+		err := util.ApplyWorkspaceEdit(edit.Edit, encoding)
+		if err != nil {
+			slog.Error("Error applying workspace edit", "error", err)
+			return protocol.ApplyWorkspaceEditResult{Applied: false, FailureReason: err.Error()}, nil
+		}
 
-	return protocol.ApplyWorkspaceEditResult{Applied: true}, nil
+		return protocol.ApplyWorkspaceEditResult{Applied: true}, nil
+	}
 }
 
 // FileWatchRegistrationHandler is a function that will be called when file watch registrations are received

internal/lsp/util/edit.go 🔗

@@ -7,10 +7,11 @@ import (
 	"sort"
 	"strings"
 
+	powernap "github.com/charmbracelet/x/powernap/pkg/lsp"
 	"github.com/charmbracelet/x/powernap/pkg/lsp/protocol"
 )
 
-func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error {
+func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit, encoding powernap.OffsetEncoding) error {
 	path, err := uri.Path()
 	if err != nil {
 		return fmt.Errorf("invalid URI: %w", err)
@@ -57,7 +58,7 @@ func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error {
 
 	// Apply each edit
 	for _, edit := range sortedEdits {
-		newLines, err := applyTextEdit(lines, edit)
+		newLines, err := applyTextEdit(lines, edit, encoding)
 		if err != nil {
 			return fmt.Errorf("failed to apply edit: %w", err)
 		}
@@ -85,13 +86,11 @@ func applyTextEdits(uri protocol.DocumentURI, edits []protocol.TextEdit) error {
 	return nil
 }
 
-func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) {
+func applyTextEdit(lines []string, edit protocol.TextEdit, encoding powernap.OffsetEncoding) ([]string, error) {
 	startLine := int(edit.Range.Start.Line)
 	endLine := int(edit.Range.End.Line)
-	startChar := int(edit.Range.Start.Character)
-	endChar := int(edit.Range.End.Character)
 
-	// Validate positions
+	// Validate positions before accessing lines.
 	if startLine < 0 || startLine >= len(lines) {
 		return nil, fmt.Errorf("invalid start line: %d", startLine)
 	}
@@ -99,6 +98,26 @@ func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) {
 		endLine = len(lines) - 1
 	}
 
+	var startChar, endChar int
+	switch encoding {
+	case powernap.UTF8:
+		// UTF-8: Character offset is already a byte offset
+		startChar = int(edit.Range.Start.Character)
+		endChar = int(edit.Range.End.Character)
+	case powernap.UTF16:
+		// UTF-16 (default): Convert to byte offset
+		startLineContent := lines[startLine]
+		endLineContent := lines[endLine]
+		startChar = powernap.PositionToByteOffset(startLineContent, edit.Range.Start.Character)
+		endChar = powernap.PositionToByteOffset(endLineContent, edit.Range.End.Character)
+	default:
+		// UTF-32: Character offset is codepoint count, convert to byte offset
+		startLineContent := lines[startLine]
+		endLineContent := lines[endLine]
+		startChar = utf32ToByteOffset(startLineContent, edit.Range.Start.Character)
+		endChar = utf32ToByteOffset(endLineContent, edit.Range.End.Character)
+	}
+
 	// Create result slice with initial capacity
 	result := make([]string, 0, len(lines))
 
@@ -149,7 +168,7 @@ func applyTextEdit(lines []string, edit protocol.TextEdit) ([]string, error) {
 }
 
 // applyDocumentChange applies a DocumentChange (create/rename/delete operations)
-func applyDocumentChange(change protocol.DocumentChange) error {
+func applyDocumentChange(change protocol.DocumentChange, encoding powernap.OffsetEncoding) error {
 	if change.CreateFile != nil {
 		path, err := change.CreateFile.URI.Path()
 		if err != nil {
@@ -222,24 +241,42 @@ func applyDocumentChange(change protocol.DocumentChange) error {
 				return fmt.Errorf("invalid edit type: %w", err)
 			}
 		}
-		return applyTextEdits(change.TextDocumentEdit.TextDocument.URI, textEdits)
+		return applyTextEdits(change.TextDocumentEdit.TextDocument.URI, textEdits, encoding)
 	}
 
 	return nil
 }
 
-// ApplyWorkspaceEdit applies the given WorkspaceEdit to the filesystem
-func ApplyWorkspaceEdit(edit protocol.WorkspaceEdit) error {
+// utf32ToByteOffset converts a UTF-32 codepoint offset to a byte offset.
+func utf32ToByteOffset(lineText string, codepointOffset uint32) int {
+	if codepointOffset == 0 {
+		return 0
+	}
+
+	var codepointCount uint32
+	for byteOffset := range lineText {
+		if codepointCount >= codepointOffset {
+			return byteOffset
+		}
+		codepointCount++
+	}
+	return len(lineText)
+}
+
+// ApplyWorkspaceEdit applies the given WorkspaceEdit to the filesystem.
+// The encoding parameter specifies the position encoding used by the LSP server
+// (UTF8, UTF16, or UTF32). This affects how character offsets are interpreted.
+func ApplyWorkspaceEdit(edit protocol.WorkspaceEdit, encoding powernap.OffsetEncoding) error {
 	// Handle Changes field
 	for uri, textEdits := range edit.Changes {
-		if err := applyTextEdits(uri, textEdits); err != nil {
+		if err := applyTextEdits(uri, textEdits, encoding); err != nil {
 			return fmt.Errorf("failed to apply text edits: %w", err)
 		}
 	}
 
 	// Handle DocumentChanges field
 	for _, change := range edit.DocumentChanges {
-		if err := applyDocumentChange(change); err != nil {
+		if err := applyDocumentChange(change, encoding); err != nil {
 			return fmt.Errorf("failed to apply document change: %w", err)
 		}
 	}

internal/lsp/util/edit_test.go 🔗

@@ -3,11 +3,319 @@ package util
 import (
 	"testing"
 
-	"github.com/stretchr/testify/require"
-
+	powernap "github.com/charmbracelet/x/powernap/pkg/lsp"
 	"github.com/charmbracelet/x/powernap/pkg/lsp/protocol"
+	"github.com/stretchr/testify/require"
 )
 
+func TestPositionToByteOffset(t *testing.T) {
+	tests := []struct {
+		name      string
+		lineText  string
+		utf16Char uint32
+		expected  int
+	}{
+		{
+			name:      "ASCII only",
+			lineText:  "hello world",
+			utf16Char: 6,
+			expected:  6,
+		},
+		{
+			name:      "CJK characters (3 bytes each in UTF-8, 1 UTF-16 unit)",
+			lineText:  "你好world",
+			utf16Char: 2,
+			expected:  6,
+		},
+		{
+			name:      "CJK - position after CJK",
+			lineText:  "var x = \"你好world\"",
+			utf16Char: 11,
+			expected:  15,
+		},
+		{
+			name:      "Emoji (4 bytes in UTF-8, 2 UTF-16 units)",
+			lineText:  "👋hello",
+			utf16Char: 2,
+			expected:  4,
+		},
+		{
+			name:      "Multiple emoji",
+			lineText:  "👋👋world",
+			utf16Char: 4,
+			expected:  8,
+		},
+		{
+			name:      "Mixed content",
+			lineText:  "Hello👋你好",
+			utf16Char: 8,
+			expected:  12,
+		},
+		{
+			name:      "Position 0",
+			lineText:  "hello",
+			utf16Char: 0,
+			expected:  0,
+		},
+		{
+			name:      "Position beyond end",
+			lineText:  "hi",
+			utf16Char: 100,
+			expected:  2,
+		},
+		{
+			name:      "Empty string",
+			lineText:  "",
+			utf16Char: 0,
+			expected:  0,
+		},
+		{
+			name:      "Surrogate pair at start",
+			lineText:  "𐐷hello",
+			utf16Char: 2,
+			expected:  4,
+		},
+		{
+			name:      "ZWJ family emoji (1 grapheme, 7 runes, 11 UTF-16 units)",
+			lineText:  "hello👨\u200d👩\u200d👧\u200d👦world",
+			utf16Char: 16,
+			expected:  30,
+		},
+		{
+			name:      "ZWJ family emoji - offset into middle of grapheme cluster",
+			lineText:  "hello👨\u200d👩\u200d👧\u200d👦world",
+			utf16Char: 8,
+			expected:  12,
+		},
+		{
+			name:      "Flag emoji (1 grapheme, 2 runes, 4 UTF-16 units)",
+			lineText:  "hello🇺🇸world",
+			utf16Char: 9,
+			expected:  13,
+		},
+		{
+			name:      "Combining character (1 grapheme, 2 runes, 2 UTF-16 units)",
+			lineText:  "caf\u0065\u0301!",
+			utf16Char: 5,
+			expected:  6,
+		},
+		{
+			name:      "Skin tone modifier (1 grapheme, 2 runes, 4 UTF-16 units)",
+			lineText:  "hi👋🏽bye",
+			utf16Char: 6,
+			expected:  10,
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := powernap.PositionToByteOffset(tt.lineText, tt.utf16Char)
+			if result != tt.expected {
+				t.Errorf("PositionToByteOffset(%q, %d) = %d, want %d",
+					tt.lineText, tt.utf16Char, result, tt.expected)
+			}
+		})
+	}
+}
+
+func TestApplyTextEdit_UTF16(t *testing.T) {
+	// Test that UTF-16 offsets are correctly converted to byte offsets
+	tests := []struct {
+		name     string
+		lines    []string
+		edit     protocol.TextEdit
+		expected []string
+	}{
+		{
+			name:  "ASCII only - no conversion needed",
+			lines: []string{"hello world"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					Start: protocol.Position{Line: 0, Character: 6},
+					End:   protocol.Position{Line: 0, Character: 11},
+				},
+				NewText: "universe",
+			},
+			expected: []string{"hello universe"},
+		},
+		{
+			name:  "CJK characters - edit after Chinese characters",
+			lines: []string{`var x = "你好world"`},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// "你好" = 2 UTF-16 units, but 6 bytes in UTF-8
+					// Position 11 is where "world" starts in UTF-16
+					Start: protocol.Position{Line: 0, Character: 11},
+					End:   protocol.Position{Line: 0, Character: 16},
+				},
+				NewText: "universe",
+			},
+			expected: []string{`var x = "你好universe"`},
+		},
+		{
+			name:  "Emoji - edit after emoji (2 UTF-16 units)",
+			lines: []string{`fmt.Println("👋hello")`},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// 👋 = 2 UTF-16 units, 4 bytes in UTF-8
+					// Position 15 is where "hello" starts in UTF-16
+					Start: protocol.Position{Line: 0, Character: 15},
+					End:   protocol.Position{Line: 0, Character: 20},
+				},
+				NewText: "world",
+			},
+			expected: []string{`fmt.Println("👋world")`},
+		},
+		{
+			name: "ZWJ family emoji - edit after grapheme cluster",
+			// "hello👨‍👩‍👧‍👦world" — family is 1 grapheme but 11 UTF-16 units
+			lines: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466world"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// "hello" = 5 UTF-16 units, family = 11 UTF-16 units
+					// "world" starts at UTF-16 offset 16
+					Start: protocol.Position{Line: 0, Character: 16},
+					End:   protocol.Position{Line: 0, Character: 21},
+				},
+				NewText: "earth",
+			},
+			expected: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466earth"},
+		},
+		{
+			name: "ZWJ family emoji - edit splits grapheme cluster in half",
+			// LSP servers can position into the middle of a grapheme cluster.
+			// After "hello" (5 UTF-16 units), the ZWJ family emoji starts.
+			// UTF-16 offset 7 lands between 👨 (2 units) and ZWJ, inside
+			// the grapheme cluster. The byte offset for position 7 is 9
+			// (5 bytes for "hello" + 4 bytes for 👨).
+			lines: []string{"hello\U0001F468\u200d\U0001F469\u200d\U0001F467\u200d\U0001F466world"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					Start: protocol.Position{Line: 0, Character: 7},
+					End:   protocol.Position{Line: 0, Character: 16},
+				},
+				NewText: "",
+			},
+			// Keeps "hello" + 👨 (first rune of cluster) then removes
+			// the rest of the cluster, leaving "hello👨world".
+			expected: []string{"hello\U0001F468world"},
+		},
+		{
+			name: "Flag emoji - edit after flag",
+			// 🇺🇸 = 2 regional indicator runes, 4 UTF-16 units, 8 bytes
+			lines: []string{"hello🇺🇸world"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					Start: protocol.Position{Line: 0, Character: 9},
+					End:   protocol.Position{Line: 0, Character: 14},
+				},
+				NewText: "earth",
+			},
+			expected: []string{"hello🇺🇸earth"},
+		},
+		{
+			name: "Combining accent - edit after composed character",
+			// "café!" where é = e + U+0301 (2 code points, 2 UTF-16 units)
+			lines: []string{"caf\u0065\u0301!"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// "caf" = 3, "e" = 1, U+0301 = 1, total = 5 UTF-16 units
+					Start: protocol.Position{Line: 0, Character: 5},
+					End:   protocol.Position{Line: 0, Character: 6},
+				},
+				NewText: "?",
+			},
+			expected: []string{"caf\u0065\u0301?"},
+		},
+		{
+			name: "Skin tone modifier - edit after modified emoji",
+			// 👋🏽 = U+1F44B U+1F3FD = 2 runes, 4 UTF-16 units, 8 bytes
+			lines: []string{"hi👋🏽bye"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// "hi" = 2, 👋🏽 = 4, total = 6 UTF-16 units
+					Start: protocol.Position{Line: 0, Character: 6},
+					End:   protocol.Position{Line: 0, Character: 9},
+				},
+				NewText: "later",
+			},
+			expected: []string{"hi👋🏽later"},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := applyTextEdit(tt.lines, tt.edit, powernap.UTF16)
+			if err != nil {
+				t.Fatalf("applyTextEdit failed: %v", err)
+			}
+			if len(result) != len(tt.expected) {
+				t.Errorf("expected %d lines, got %d: %v", len(tt.expected), len(result), result)
+				return
+			}
+			for i := range result {
+				if result[i] != tt.expected[i] {
+					t.Errorf("line %d: expected %q, got %q", i, tt.expected[i], result[i])
+				}
+			}
+		})
+	}
+}
+
+func TestApplyTextEdit_UTF8(t *testing.T) {
+	// Test that UTF-8 offsets are used directly without conversion
+	tests := []struct {
+		name     string
+		lines    []string
+		edit     protocol.TextEdit
+		expected []string
+	}{
+		{
+			name:  "ASCII only - direct byte offset",
+			lines: []string{"hello world"},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					Start: protocol.Position{Line: 0, Character: 6},
+					End:   protocol.Position{Line: 0, Character: 11},
+				},
+				NewText: "universe",
+			},
+			expected: []string{"hello universe"},
+		},
+		{
+			name:  "CJK characters - byte offset used directly",
+			lines: []string{`var x = "你好world"`},
+			edit: protocol.TextEdit{
+				Range: protocol.Range{
+					// With UTF-8 encoding, position 15 is the byte offset
+					Start: protocol.Position{Line: 0, Character: 15},
+					End:   protocol.Position{Line: 0, Character: 20},
+				},
+				NewText: "universe",
+			},
+			expected: []string{`var x = "你好universe"`},
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result, err := applyTextEdit(tt.lines, tt.edit, powernap.UTF8)
+			if err != nil {
+				t.Fatalf("applyTextEdit failed: %v", err)
+			}
+			if len(result) != len(tt.expected) {
+				t.Errorf("expected %d lines, got %d: %v", len(tt.expected), len(result), result)
+				return
+			}
+			for i := range result {
+				if result[i] != tt.expected[i] {
+					t.Errorf("line %d: expected %q, got %q", i, tt.expected[i], result[i])
+				}
+			}
+		})
+	}
+}
+
 func TestRangesOverlap(t *testing.T) {
 	t.Parallel()