Part VIII — Appendices

Reading Assembly

May 16, 2026·12 min read·intermediate

The ability to read assembly is one of the most useful skills for understanding what a program actually does. Compilers translate high-level code into instructions; reading those instructions reveals the cost of the abstractions, exposes bugs that source-level tools miss, and grounds the architectural concepts of this book in concrete code.

This appendix walks through reading assembly across the three reference ISAs of the book — x86-64, AArch64, and RISC-V — using parallel translations of small C functions. The aim is fluency: by the end you should be able to glance at a disassembly and quickly recognize the structure.

01.Tools

To produce assembly from C source:

Bash

# x86-64 with GCC, AT&T syntax (default)
gcc -O2 -S example.c -o example.s
# x86-64 with GCC, Intel syntax
gcc -O2 -S -masm=intel example.c -o example.s
# x86-64 with Clang
clang -O2 -S example.c -o example.s
# AArch64 cross-compile
aarch64-linux-gnu-gcc -O2 -S example.c -o example.s
# RISC-V cross-compile
riscv64-linux-gnu-gcc -O2 -S example.c -o example.s

To disassemble an existing binary:

Bash

objdump -d -M intel binary           # x86-64 Intel syntax
objdump -d binary                    # default
aarch64-linux-gnu-objdump -d binary
riscv64-linux-gnu-objdump -d binary

Online: godbolt.org (Compiler Explorer) is the standard tool for inspecting compiler output across many compilers and ISAs. It produces clean, color-coded assembly side-by-side with source.

02.Calling Conventions: Quick Reference

All three ABIs pass arguments in registers (with stack spillover for many arguments):

	x86-64 (System V)	AArch64 (AAPCS64)	RISC-V (RVCC)
Integer args	rdi, rsi, rdx, rcx, r8, r9	x0-x7	a0-a7
Return value	rax (low), rdx (high)	x0 (low), x1 (high)	a0 (low), a1 (high)
Stack pointer	rsp	sp (register number 31, an encoding shared with xzr; there is no x31)	sp (x2)
Frame pointer	rbp (optional)	x29	s0/fp (x8)
Link/return	(on stack)	x30 (LR)	ra (x1)
Callee-saved	rbx, rbp, r12-r15	x19-x29	s0-s11
Caller-saved	rax, rcx, rdx, rsi, rdi, r8-r11	x0-x17, x30 (x18 is the platform register; avoid it in portable code)	a0-a7, t0-t6, ra

Float args go in xmm0-xmm7 (x86-64), v0-v7 (AArch64), fa0-fa7 (RISC-V).

03.Notation Conventions

x86-64 AT&T syntax (used by objdump default, GCC default):

Code

mov %rdi, %rax    # source, destination
add $1, %rax      # immediate prefix $, register prefix %
mov 8(%rdi), %rax # rax = *(rdi + 8)

x86-64 Intel syntax (used by Microsoft, NASM, MASM, objdump -M intel):

Code

mov rax, rdi      # destination, source
add rax, 1        # immediate has no prefix
mov rax, [rdi+8]  # bracket for memory

We'll use Intel syntax for x86-64 in this appendix because it's more readable for those new to assembly.

AArch64:

Code

mov x0, x1        # destination, source
add x0, x0, #1    # immediates use # prefix
ldr x0, [x1, #8]  # x0 = *(x1+8)

RISC-V:

Code

mv a0, a1         # mv = pseudo for addi a0, a1, 0
addi a0, a0, 1    # destination, source, immediate
ld a0, 8(a1)      # a0 = *(a1+8)

04.Example 1: Simple Arithmetic

int add_three(int a, int b, int c) {
    return a + b + c;
}

x86-64 (`-O2`, Intel syntax)

Code

add_three:
    lea     eax, [rdi + rsi]   ; eax = a + b
    add     eax, edx           ; eax += c
    ret

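The lea (Load Effective Address) instruction performs dest = base + index; here it computes a + b in one instruction without touching memory. eax is the lower 32 bits of rax, which is the return register. Note: the int arguments occupy edi, esi, and edx (the low 32 bits of rdi, rsi, and rdx) because they're 32-bit int. The lea uses the full 64-bit registers in its address computation, which is harmless: only the low 32 bits of the sum are written to eax.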

AArch64

Assembly

add_three:
    add     w0, w0, w1
    add     w0, w0, w2
    ret

w0-w2 are the lower 32-bit halves of x0-x2. Two adds; very direct. Return value goes in w0/x0.

RISC-V (RV64GC)

Assembly

add_three:
    addw    a0, a0, a1
    addw    a0, a0, a2
    ret

addw is "add word" — 32-bit add with sign extension. RISC-V uses register names a0-a2 for the first three integer args, all in 64-bit registers. addw ensures correct 32-bit semantics matching int in C.

All three are essentially identical: two additions plus return.

05.Example 2: Memory Access

int load_field(struct point *p) {
    return p->x;  // assume struct point { int x; int y; };
}

x86-64

Code

load_field:
    mov     eax, [rdi]   ; rdi holds p; load *p (the x field, offset 0)
    ret

AArch64

Assembly

load_field:
    ldr     w0, [x0]
    ret

RISC-V

Assembly

load_field:
    lw      a0, 0(a0)
    ret

lw = load word (32-bit, sign-extended into 64-bit). All three perform a 32-bit load from the address in the first argument register.

If we ask for p->y instead (offset 4 into the struct):

x86-64: mov eax, [rdi+4]
AArch64: ldr w0, [x0, #4]
RISC-V: lw a0, 4(a0)

Each ISA has slightly different addressing-mode syntax but the underlying operation is the same: load 32 bits from a base + small offset.

06.Example 3: Loop with Sum

int sum_array(int *arr, int n) {
    int s = 0;
    for (int i = 0; i < n; i++) {
        s += arr[i];
    }
    return s;
}

This is the canonical loop. With -O2 and modern compilers, expect vectorization on most ISAs, but for a compact illustration assume -O1 (no auto-vectorization).

x86-64

Code

sum_array:
    test    esi, esi          ; if n <= 0
    jle     .L4               ;   skip the loop
    mov     ecx, esi          ; ecx = n (loop bound)
    xor     eax, eax          ; eax = s = 0
    xor     edx, edx          ; edx = i = 0
.L3:
    add     eax, [rdi + rdx*4]  ; s += arr[i]
    add     edx, 1              ; i++
    cmp     edx, ecx
    jne     .L3
    ret
.L4:
    xor     eax, eax
    ret

Key features:

xor reg, reg is the canonical x86-64 zeroing idiom (shorter encoding than mov reg, 0).
[rdi + rdx*4] is the scaled-index addressing mode: base + index × scale. The *4 reflects that each int is 4 bytes.
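The loop is a guarded do-while: a single test before the loop skips it entirely when n <= 0, and the compare-and-branch at the bottom (cmp / jne) repeats the body until i reaches n.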

AArch64

Assembly

sum_array:
    cmp     w1, #0
    ble     .L4             ; n <= 0: nothing to sum
    mov     w2, #0          ; i = 0
    mov     w4, #0          ; s = 0 (kept out of w0: x0 still holds arr)
.L3:
    ldr     w3, [x0, w2, sxtw #2]  ; load arr[i], scaling i by 4
    add     w4, w4, w3      ; s += arr[i]
    add     w2, w2, #1      ; i++
    cmp     w2, w1
    bne     .L3
    mov     w0, w4          ; return value goes in w0
    ret
.L4:
    mov     w0, #0          ; empty array: return 0
    ret

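Note that x0 is both the incoming pointer argument and the return-value register, so the pointer and the running sum cannot share it inside the loop; a real compiler keeps one of them in a separate register. Treat the listing as a sketch of the loop structure rather than verbatim compiler output. The interesting instruction: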

Code

ldr w3, [x0, w2, sxtw #2]

reads as: load 32 bits from address x0 + sign_extend(w2) << 2. The sxtw extension and #2 shift implement the scale-by-4 in one instruction.

RISC-V

Assembly

sum_array:
    blez    a1, .L4         ; if n <= 0 goto end
    li      a4, 0           ; i = 0
    li      a3, 0           ; s = 0
.L3:
    slli    a5, a4, 2       ; a5 = i * 4
    add     a5, a0, a5      ; a5 = &arr[i]
    lw      a2, 0(a5)       ; load arr[i]
    addw    a3, a3, a2      ; s += arr[i]
    addiw   a4, a4, 1       ; i++
    bne     a4, a1, .L3
    mv      a0, a3
    ret
.L4:
    li      a0, 0
    ret

RISC-V's load instruction takes only base + immediate-offset (no scaled index). So computing &arr[i] requires an explicit shift and add. This is RISC-V's "RISC purity" cost: more instructions, but each one is simple and uniform.

07.Example 4: Conditional

int max(int a, int b) {
    return a > b ? a : b;
}

x86-64

Code

max:
    mov     eax, edi
    cmp     edi, esi
    cmovl   eax, esi    ; if edi < esi, eax = esi
    ret

cmovl (conditional move if less) — branchless. Compilers prefer this for short conditional expressions to avoid branch-misprediction penalties.

AArch64

Assembly

max:
    cmp     w0, w1
    csel    w0, w0, w1, gt   ; w0 = (w0 > w1) ? w0 : w1
    ret

csel (conditional select) is AArch64's branchless conditional-move equivalent.

RISC-V (without Zicond/Zbb)

Assembly

max:
    bge     a0, a1, .L1
    mv      a0, a1
.L1:
    ret

Base RISC-V has no conditional-move, so the compiler emits a branch. With the Zicond extension or Zbb (which provides max), the compiler can produce branchless code.

This illustrates a real trade-off: x86-64 and AArch64 have rich predicated/conditional-select instructions; base RISC-V is simpler but produces more branches.

08.Example 5: Function Call

int f(int);  // defined elsewhere; the prototype must precede its first use

int caller(int x) {
    return f(x) + 1;
}

x86-64

Code

caller:
    push    rbp           ; (or sub rsp, 8 to align)
    call    f
    add     eax, 1
    pop     rbp
    ret

Or with explicit stack alignment:

Code

caller:
    sub     rsp, 8        ; align to 16 before call
    call    f
    add     eax, 1
    add     rsp, 8
    ret

The argument is already in edi, so no setup is needed. The call instruction pushes the return address; ret pops it.

AArch64

Assembly

caller:
    stp     x29, x30, [sp, #-16]!   ; save FP and LR, allocate 16 bytes
    mov     x29, sp
    bl      f
    add     w0, w0, #1
    ldp     x29, x30, [sp], #16     ; restore and deallocate
    ret

stp/ldp (store-pair / load-pair) save and restore two registers in one instruction. bl (Branch with Link) sets x30 (LR) to the return address. ret jumps to address in x30.

RISC-V

Assembly

caller:
    addi    sp, sp, -16       ; allocate a 16-byte frame (keeps sp 16-byte aligned)
    sd      ra, 8(sp)         ; save return address
    call    f                 ; x is already in a0; the result comes back in a0
    addiw   a0, a0, 1         ; 32-bit add-immediate: f(x) + 1, sign-extended
    ld      ra, 8(sp)         ; restore
    addi    sp, sp, 16        ; release the frame
    ret                       ; jr ra (pseudo)

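call is a pseudo-instruction that expands to auipc + jalr for far calls. ra is the return address register; we save it because the call itself overwrites ra with the address to return to inside caller, so caller's own return address would otherwise be lost.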

09.Example 6: Switch Statement

int classify(int n) {
    switch (n) {
        case 0: return 100;
        case 1: return 200;
        case 2: return 300;
        case 3: return 400;
        default: return -1;
    }
}

For dense cases, compilers often use a jump table:

x86-64

Code

classify:
    cmp     edi, 3
    ja      .Ldefault           ; unsigned compare: n < 0 or n > 3 -> default
    movsxd  rdi, edi            ; widen n to 64 bits for use as an index
    lea     rcx, [rel .Ltable]  ; rcx = address of the jump table (RIP-relative)
    jmp     [rcx + rdi*8]       ; indirect jump to table[n]; entries are 8 bytes
.Ltable:
    dq      .L0, .L1, .L2, .L3
.L0:
    mov     eax, 100
    ret
.L1:
    mov     eax, 200
    ret
.L2:
    mov     eax, 300
    ret
.L3:
    mov     eax, 400
    ret
.Ldefault:
    mov     eax, -1
    ret

The jmp [rcx + rdi*8] is an indirect jump through the table — fast, but a target for branch-target injection (Spectre v2). Modern compilers may use retpolines in security-sensitive contexts.

AArch64 / RISC-V

Similar structure with their respective indirect-branch instructions (br on AArch64, jr / jalr on RISC-V). For sparse switches, compilers fall back to chained comparisons.

10.Reading Optimized Code

Modern compilers at -O2 or -O3 produce code that can look surprising:

Strength reduction: x * 5 may become (x << 2) + x or use lea eax, [rdi + rdi*4] on x86-64.

Loop unrolling: small loops are unrolled 2x, 4x, or 8x; the loop body grows but iteration count drops.

Vectorization: loops over arrays are converted to SIMD. Look for xmm/ymm/zmm registers (x86), v0-v31 (AArch64 NEON) or z0-z31 (AArch64 SVE), or vector instructions (RISC-V V).

Common-subexpression elimination: redundant computations are factored out.

Inlining: small functions disappear into their callers.

Branch elimination: short branches become cmov/csel.

Tail-call optimization: a call followed by ret becomes a jmp/b/j.

Autoparallelization (rare): some compilers can emit OpenMP-like parallel constructs from sequential loops; usually requires explicit pragmas.

When reading optimized assembly, identify the structure:

Prologue: stack setup, register saves.
Body: the actual work.
Epilogue: register restores, stack teardown, ret.
Loops: look for backward branches.
Calls: call (x86-64), bl (AArch64), and jal/jalr or the call pseudo-instruction (RISC-V) show external dependencies.

If the assembly is too dense, recompile at -O0 to see the unoptimized version, then increase optimization level to see what the optimizer did.

11.Common Patterns and Idioms

Tail recursion → loop: compilers convert tail-recursive calls to jumps back to function start.

Bitfield extraction: (x >> shift) & mask. May become bextr on x86-64 with BMI1, ubfx on AArch64.

Population count: hardware instruction popcnt (x86-64), cnt (AArch64; a NEON instruction that counts bits per byte, usually followed by addv to sum the bytes), cpop (RISC-V Zbb).

Leading zero count: lzcnt (x86-64), clz (AArch64), clz (RISC-V Zbb). Used in fast log2, normalization.

Endianness conversion: bswap (x86-64), rev (AArch64), rev8 (RISC-V Zbb).

Atomic operations: lock-prefixed (x86-64), ldxr/stxr or ldadd (AArch64), lr.w/sc.w or amoadd.w (RISC-V).

Memory barriers: mfence/lfence/sfence (x86-64), dmb/dsb/isb (AArch64), fence (RISC-V).

12.Tips for Practice

Write a small C function. Compile to assembly at multiple optimization levels (-O0, -O1, -O2). Compare.
Use Compiler Explorer to view the same code across x86-64, AArch64, and RISC-V simultaneously.
Read your own optimized binary with objdump -d -S binary (the -S interleaves source; the binary must be built with -g so that debug information is available).
Try to predict what assembly a function will generate before compiling it.
Keep ISA reference manuals at hand; nobody memorizes all the encodings.

13.Summary

This appendix has shown:

How to invoke compilers and disassemblers across the three ISAs.
Calling conventions and notation conventions.
Side-by-side translations of arithmetic, memory access, loops, conditionals, calls, and switches.
Patterns to look for in optimized output: strength reduction, vectorization, branch elimination.
Hardware-supported idioms: popcount, bitfield, endian swap, atomics, barriers.

With practice, reading assembly becomes second nature — and once it does, performance work becomes much less mysterious. Source-level intuition often misleads; the assembly tells the truth about what the CPU will actually execute.

Book mode

	# x86-64 with GCC, AT&T syntax (default)
	gcc -O2 -S example.c -o example.s

	# x86-64 with GCC, Intel syntax
	gcc -O2 -S -masm=intel example.c -o example.s

	# x86-64 with Clang
	clang -O2 -S example.c -o example.s

	# AArch64 cross-compile
	aarch64-linux-gnu-gcc -O2 -S example.c -o example.s

	# RISC-V cross-compile
	riscv64-linux-gnu-gcc -O2 -S example.c -o example.s

	mov %rdi, %rax # source, destination
	add $1, %rax # immediate prefix $, register prefix %
	mov 8(%rdi), %rax # rax = *(rdi + 8)

	mov rax, rdi # destination, source
	add rax, 1 # immediate has no prefix
	mov rax, [rdi+8] # bracket for memory

	mov x0, x1 # destination, source
	add x0, x0, #1 # immediates use # prefix
	ldr x0, [x1, #8] # x0 = *(x1+8)

	mv a0, a1 # mv = pseudo for addi a0, a1, 0
	addi a0, a0, 1 # destination, source, immediate
	ld a0, 8(a1) # a0 = *(a1+8)

	add_three:
	lea eax, [rdi + rsi] ; eax = a + b
	add eax, edx ; eax += c
	ret

	int load_field(struct point *p) {
	return p->x; // assume struct point { int x; int y; };
	}

	load_field:
	mov eax, [rdi] ; rdi holds p; load *p (the x field, offset 0)
	ret

	int sum_array(int *arr, int n) {
	int s = 0;
	for (int i = 0; i < n; i++) {
	s += arr[i];
	}
	return s;
	}

	sum_array:
	test esi, esi ; if n <= 0
	jle .L4 ; skip the loop
	mov ecx, esi ; ecx = n (loop bound)
	xor eax, eax ; eax = s = 0
	xor edx, edx ; edx = i = 0
	.L3:
	add eax, [rdi + rdx*4] ; s += arr[i]
	add edx, 1 ; i++
	cmp edx, ecx
	jne .L3
	ret
	.L4:
	xor eax, eax
	ret

	sum_array:
	cmp w1, #0
	ble .L4 ; n <= 0: nothing to sum
	mov w2, #0 ; i = 0
	mov w4, #0 ; s = 0 (kept out of w0: x0 still holds arr)
	.L3:
	ldr w3, [x0, w2, sxtw #2] ; load arr[i], scaling i by 4
	add w4, w4, w3 ; s += arr[i]
	add w2, w2, #1 ; i++
	cmp w2, w1
	bne .L3
	mov w0, w4 ; return value goes in w0
	ret
	.L4:
	mov w0, #0 ; empty array: return 0
	ret

	max:
	mov eax, edi
	cmp edi, esi
	cmovl eax, esi ; if edi < esi, eax = esi
	ret

	max:
	cmp w0, w1
	csel w0, w0, w1, gt ; w0 = (w0 > w1) ? w0 : w1
	ret

	int f(int); // defined elsewhere; the prototype must precede its first use

	int caller(int x) {
	return f(x) + 1;
	}

	caller:
	push rbp ; (or sub rsp, 8 to align)
	call f
	add eax, 1
	pop rbp
	ret

	caller:
	sub rsp, 8 ; align to 16 before call
	call f
	add eax, 1
	add rsp, 8
	ret

	caller:
	stp x29, x30, [sp, #-16]! ; save FP and LR, allocate 16 bytes
	mov x29, sp
	bl f
	add w0, w0, #1
	ldp x29, x30, [sp], #16 ; restore and deallocate
	ret

	caller:
	addi sp, sp, -16 ; allocate a 16-byte frame (keeps sp 16-byte aligned)
	sd ra, 8(sp) ; save return address
	call f ; x is already in a0; the result comes back in a0
	addiw a0, a0, 1 ; 32-bit add-immediate: f(x) + 1, sign-extended
	ld ra, 8(sp) ; restore
	addi sp, sp, 16 ; release the frame
	ret ; jr ra (pseudo)

	classify:
	cmp edi, 3
	ja .Ldefault ; unsigned compare: n < 0 or n > 3 -> default
	movsxd rdi, edi ; widen n to 64 bits for use as an index
	lea rcx, [rel .Ltable] ; rcx = address of the jump table (RIP-relative)
	jmp [rcx + rdi*8] ; indirect jump to table[n]; entries are 8 bytes
	.Ltable:
	dq .L0, .L1, .L2, .L3
	.L0:
	mov eax, 100
	ret
	.L1:
	mov eax, 200
	ret
	.L2:
	mov eax, 300
	ret
	.L3:
	mov eax, 400
	ret
	.Ldefault:
	mov eax, -1
	ret