-
Notifications
You must be signed in to change notification settings - Fork 2.3k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
gopls/internal/util/asm: better assembly parsing
This CL adds a rudimentary parser for symbols in Go .s files. It is a placeholder for a more principled implementation, but it is sufficient to make Definition support control labels (also in this CL) and for a cross-references index (future work). + test of Definition on control label + test of asm.Parse Updates golang/go#71754 Change-Id: I2ff19b4ade130c051197d6b097a1a3dbcd95555a Reviewed-on: https://go-review.googlesource.com/c/tools/+/654335 LUCI-TryBot-Result: Go LUCI <[email protected]> Reviewed-by: Jonathan Amsterdam <[email protected]> Auto-Submit: Alan Donovan <[email protected]>
- Loading branch information
Showing
5 changed files
with
353 additions
and
24 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,245 @@ | ||
// Copyright 2025 The Go Authors. All rights reserved. | ||
// Use of this source code is governed by a BSD-style | ||
// license that can be found in the LICENSE file. | ||
|
||
// Package asm provides a simple parser for Go assembly files. | ||
package asm | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"fmt" | ||
"strings" | ||
"unicode" | ||
) | ||
|
||
// Kind describes the nature of an identifier in an assembly file.
type Kind uint8

const (
	Invalid Kind = iota // reserved zero value; not used by Ident
	Ref                 // arbitrary reference to symbol or control label
	Text                // definition of TEXT (function) symbol
	Global              // definition of GLOBL (var) symbol
	Data                // initialization of GLOBL (var) symbol; effectively a reference
	Label               // definition of control label
)

// String returns the lower-case name of the kind (e.g. "text"),
// or "Kind(N)" for a value outside the declared constants.
func (k Kind) String() string {
	if int(k) < len(kindString) {
		return kindString[k]
	}
	return fmt.Sprintf("Kind(%d)", k)
}

// kindString holds the printed name of each declared Kind, indexed by Kind.
var kindString = [...]string{
	Invalid: "invalid",
	Ref:     "ref",
	Text:    "text",
	Global:  "global",
	Data:    "data",
	Label:   "label",
}

// A File represents a parsed file of Go assembly language.
type File struct {
	Idents []Ident // identifiers found in the file

	// TODO(adonovan): use token.File? This may be important in a
	// future in which analyzers can report diagnostics in .s files.
}

// Ident represents an identifier in an assembly file.
type Ident struct {
	Name   string // symbol name (after correcting [·∕]); Name[0]='.' => current package
	Offset int    // zero-based byte offset
	Kind   Kind
}

// End returns the identifier's end offset in the source file.
//
// Name holds the corrected spelling, in which each "·" (U+00B7, two
// bytes of UTF-8) has become "." and each "∕" (U+2215, three bytes)
// has become "/". Neither "." nor "/" is a valid identifier
// character in the source, so every occurrence in Name stands for
// the longer source form, and the source length is recovered by
// adding back the bytes that the correction removed.
func (id Ident) End() int {
	return id.Offset + len(id.Name) +
		strings.Count(id.Name, ".")*(len("·")-len(".")) +
		strings.Count(id.Name, "/")*(len("∕")-len("/"))
}
|
||
// Parse extracts identifiers from Go assembly files. | ||
// Since it is a best-effort parser, it never returns an error. | ||
func Parse(content []byte) *File { | ||
var idents []Ident | ||
offset := 0 // byte offset of start of current line | ||
|
||
// TODO(adonovan) use a proper tokenizer that respects | ||
// comments, string literals, line continuations, etc. | ||
scan := bufio.NewScanner(bytes.NewReader(content)) | ||
for ; scan.Scan(); offset += len(scan.Bytes()) + len("\n") { | ||
line := scan.Text() | ||
|
||
// Strip comments. | ||
if idx := strings.Index(line, "//"); idx >= 0 { | ||
line = line[:idx] | ||
} | ||
|
||
// Skip blank lines. | ||
if strings.TrimSpace(line) == "" { | ||
continue | ||
} | ||
|
||
// Check for label definitions (ending with colon). | ||
if colon := strings.IndexByte(line, ':'); colon > 0 { | ||
label := strings.TrimSpace(line[:colon]) | ||
if isIdent(label) { | ||
idents = append(idents, Ident{ | ||
Name: label, | ||
Offset: offset + strings.Index(line, label), | ||
Kind: Label, | ||
}) | ||
continue | ||
} | ||
} | ||
|
||
// Split line into words. | ||
words := strings.Fields(line) | ||
if len(words) == 0 { | ||
continue | ||
} | ||
|
||
// A line of the form | ||
// TEXT ·sym<ABIInternal>(SB),NOSPLIT,$12 | ||
// declares a text symbol "·sym". | ||
if len(words) > 1 { | ||
kind := Invalid | ||
switch words[0] { | ||
case "TEXT": | ||
kind = Text | ||
case "GLOBL": | ||
kind = Global | ||
case "DATA": | ||
kind = Data | ||
} | ||
if kind != Invalid { | ||
sym := words[1] | ||
sym = cutBefore(sym, ",") // strip ",NOSPLIT,$12" etc | ||
sym = cutBefore(sym, "(") // "sym(SB)" -> "sym" | ||
sym = cutBefore(sym, "<") // "sym<ABIInternal>" -> "sym" | ||
sym = strings.TrimSpace(sym) | ||
if isIdent(sym) { | ||
// (The Index call assumes sym is not itself "TEXT" etc.) | ||
idents = append(idents, Ident{ | ||
Name: cleanup(sym), | ||
Kind: kind, | ||
Offset: offset + strings.Index(line, sym), | ||
}) | ||
} | ||
continue | ||
} | ||
} | ||
|
||
// Find references in the rest of the line. | ||
pos := 0 | ||
for _, word := range words { | ||
// Find actual position of word within line. | ||
tokenPos := strings.Index(line[pos:], word) | ||
if tokenPos < 0 { | ||
panic(line) | ||
} | ||
tokenPos += pos | ||
pos = tokenPos + len(word) | ||
|
||
// Reject probable instruction mnemonics (e.g. MOV). | ||
if len(word) >= 2 && word[0] != '·' && | ||
!strings.ContainsFunc(word, unicode.IsLower) { | ||
continue | ||
} | ||
|
||
if word[0] == '$' { | ||
word = word[1:] | ||
tokenPos++ | ||
|
||
// Reject probable immediate values (e.g. "$123"). | ||
if !strings.ContainsFunc(word, isNonDigit) { | ||
continue | ||
} | ||
} | ||
|
||
// Reject probably registers (e.g. "PC"). | ||
if len(word) <= 3 && !strings.ContainsFunc(word, unicode.IsLower) { | ||
continue | ||
} | ||
|
||
// Probable identifier reference. | ||
// | ||
// TODO(adonovan): handle FP symbols correctly; | ||
// sym+8(FP) is essentially a comment about | ||
// stack slot 8, not a reference to a symbol | ||
// with a declaration somewhere; so they form | ||
// an equivalence class without a canonical | ||
// declaration. | ||
// | ||
// TODO(adonovan): handle pseudoregisters and field | ||
// references such as: | ||
// MOVD $runtime·g0(SB), g // pseudoreg | ||
// MOVD R0, g_stackguard0(g) // field ref | ||
|
||
sym := cutBefore(word, "(") // "·sym(SB)" => "sym" | ||
sym = cutBefore(sym, "+") // "sym+8(FP)" => "sym" | ||
sym = cutBefore(sym, "<") // "sym<ABIInternal>" =>> "sym" | ||
if isIdent(sym) { | ||
idents = append(idents, Ident{ | ||
Name: cleanup(sym), | ||
Kind: Ref, | ||
Offset: offset + tokenPos, | ||
}) | ||
} | ||
} | ||
} | ||
|
||
_ = scan.Err() // ignore scan errors | ||
|
||
return &File{Idents: idents} | ||
} | ||
|
||
// isIdent reports whether s is a valid Go assembly identifier. | ||
func isIdent(s string) bool { | ||
for i, r := range s { | ||
if !isIdentRune(r, i) { | ||
return false | ||
} | ||
} | ||
return len(s) > 0 | ||
} | ||
|
||
// cutBefore returns the prefix of s that precedes the first
// occurrence of sep, or all of s if sep does not occur in it.
func cutBefore(s, sep string) string {
	i := strings.Index(s, sep)
	if i < 0 {
		return s
	}
	return s[:i]
}
|
||
// repl rewrites the assembler's stand-in characters to the
// characters they represent in linker symbol names.
var repl = strings.NewReplacer(
	"·", ".", // (U+00B7 MIDDLE DOT)
	"∕", "/", // (U+2215 DIVISION SLASH)
)

// cleanup converts a symbol name from assembler syntax to linker
// syntax by applying repl to it.
func cleanup(sym string) string {
	linkName := repl.Replace(sym)
	return linkName
}
|
||
// isNonDigit reports whether r is anything other than a decimal
// digit, as judged by unicode.IsDigit.
func isNonDigit(r rune) bool {
	digit := unicode.IsDigit(r)
	return !digit
}
|
||
// -- plundered from GOROOT/src/cmd/asm/internal/asm/parse.go -- | ||
|
||
// isIdentRune reports whether ch may appear in an identifier at
// byte index i. Letters, underscore, center dot (·) and division
// slash (∕) are accepted anywhere; digits are accepted only after
// the first character.
func isIdentRune(ch rune, i int) bool {
	switch {
	case unicode.IsLetter(ch):
		return true
	case ch == '_': // Underscore; traditional.
		return true
	case ch == '\u00B7': // Represents the period in runtime.exit. U+00B7 '·' middle dot
		return true
	case ch == '\u2215': // Represents the slash in runtime/debug.setGCPercent. U+2215 '∕' division slash
		return true
	case unicode.IsDigit(ch):
		return i > 0 // Digits are OK only after the first character.
	}
	return false
}
Oops, something went wrong.