Assembly Crash Course

前言

这个模块是关于汇编语言编程的，要求设置寄存器rdi的值为0x1337。在x86-64架构的Linux系统中，rdi是第一个参数寄存器。

为了将0x1337这个值放入rdi寄存器，你可以使用以下的汇编代码：

复制代码

assembly

mov $0x1337, %rdi

这条指令将立即数0x1337移动到rdi寄存器中。
如果你需要将这条指令转换为字节码形式（即机器码），你可以使用任何支持x86-64汇编的汇编器，如GNU Assembler (GAS)。以下是如何使用GAS将上述汇编指令转换为字节码的步骤：

创建一个名为set_rdi.s的汇编源文件，其中包含以下内容：

assembly

# set_rdi.s -- GNU as (GAS), AT&T syntax, Linux x86-64.
.text
.globl _start

_start:
    mov $0x1337, %rdi       # rdi = 0x1337, the value the exercise asks for

    # Exit cleanly so execution does not run past the end of the code.
    mov $60, %rax           # system call number (sys_exit)
    xor %rdi, %rdi          # exit status 0 (this overwrites the 0x1337 loaded above)
    syscall
使用GAS将汇编源文件编译为机器码：

bash
as -o set_rdi.o set_rdi.s
链接生成的目标文件以创建可执行文件：

bash

ld -m elf_x86_64 -o set_rdi set_rdi.o

现在，set_rdi可执行文件包含了将0x1337设置到rdi寄存器的机器码。你可以使用objdump工具来查看这些机器码：

复制代码

bash

objdump -d ./set_rdi

这将显示反汇编的输出，其中包含了mov $0x1337, %rdi指令的机器码表示。注意，实际的机器码可能会因为不同的系统和编译器选项而有所不同。

1. 寄存器赋值： level 1~level 2

python 复制代码

from pwn import *

# Select the amd64 architecture for asm().
context.update(arch="amd64")

# Assembly source sent to the challenge; edit the instruction(s) between the quotes.
payload_src = """
    mov  rdi,0x1337   # Write your assembly code here
 """

target = process("/challenge/run")
target.write(asm(payload_src))
target.interactive()  # alternative: print(target.readallS())

python 复制代码

# Load an immediate value into each of three registers.
mov  rax, 0x1337
mov  r12, 0xCAFED00D1337BEEF   # full 64-bit immediate
mov  rsp, 0x31337

2. 加减乘除（模）：level 3~level 6

python 复制代码

add  rdi,0x331337   # rdi = rdi + 0x331337

python 复制代码

imul  rdi,rsi  # rdi = rdi*rsi ; imul is the signed multiply, mul the unsigned one;
#       one-operand form imul xxx / mul xxx ==> rax*xxx, low half stored in rax (high half in rdx); a signed product may overflow
add  rdi,rdx       # rdi = rdi*rsi + rdx
mov  rax,rdi       # copy the result into rax

python 复制代码

# div takes a single operand, the divisor. The dividend is the 128-bit value
# rdx:rax; the quotient is left in rax and the remainder in rdx.
xor rdx,rdx      # clear the high half of the dividend
mov rax,rdi      # low half of the dividend = rdi
div rsi          # rax = rdi / rsi (unsigned)

python 复制代码

mov rax, rdi      # low half of the dividend = rdi
xor rdx, rdx      # high half of the dividend = 0
div rsi           # unsigned divide of rdx:rax by rsi
mov rax, rdx      # div leaves the remainder in rdx, so rax = rdi % rsi

3. 通用数据寄存器高低位：level 7 ~ level 8

通用数据寄存器：rax，rbx，rcx，rdx

以rax寄存器为例：

MSB 【high】                              LSB 【low】
+----------------------------------------+
|                  rax                   | 64 bit
+--------------------+-------------------+
                     |        eax        | 32 bit
                     +---------+---------+
                               |   ax    | 16 bit
                               +----+----+
                               | ah | al | 8 bit each
                               +----+----+

python 复制代码

mov     ah, 0x42    # writes only bits 8-15 of rax; the other bits keep their value

仅使用 mov 指令完成模运算

python 复制代码

# x % 256 is the low byte of x and x % 65536 is its low word, so copying the
# matching sub-registers with mov is enough. Both operands of a mov must have
# the same size (e.g. `mov rax,cl` is rejected by the assembler).
# NOTE(review): writing al / bx leaves the upper bits of rax / rbx untouched, so
# this relies on them being zero beforehand -- confirm against the challenge setup.
mov al,dil     # low byte of rdi -> al
mov bx,si      # low word of rsi -> bx

4. 逻辑运算：level 9 ~ level 11

python 复制代码

#   reg2的数据表示移动位数【可以是立即数，如：shl rdi,24；若用寄存器指定位数，则只能使用 cl，如：shl rdi,cl】
shl reg1, reg2       <=>     Shift reg1 left by the amount in reg2
shr reg1, reg2       <=>     Shift reg1 right by the amount in reg2

python 复制代码

  rdi = | B7 | B6 | B5 | B4 | B3 | B2 | B1 | B0 |
  Set rax to the value of B4

#  First shift rdi left by 3*8 = 24 bits, giving:
# | B4 | B3 | B2 | B1 | B0 | 0  | 0  | 0  |
# then shift right by 7*8 = 56 bits, giving:
#  | 0  | 0  | 0  | 0  | 0  | 0  | 0  | B4 |
#  and finally copy the result into rax

shl rdi,24
shr rdi,56
mov rax ,rdi

python 复制代码

and rdi,rsi     # rdi = rdi & rsi (goal: rax = rdi AND rsi without using mov)
xor rax,rax     # rax = 0
or rax,rdi      # rax = 0 | rdi, i.e. a copy of rdi made without mov

python 复制代码

Implement the following logic:
  if x is even then   # x 是 偶数，y = 1；反之 y=0
    y = 1
  else
    y = 0

where:
  x = rdi
  y = rax

python 复制代码

and rdi,0x1    # keep only the lowest bit of rdi: 0 = even, 1 = odd
xor rdi,1       # flip it ----->  1 = even, 0 = odd
xor rax,rax   #  clear rax
or rax,rdi    # rax = the flipped bit

5. 内存<-->寄存器操作：level 12~level 14

python 复制代码

In x86 we can access the thing at a memory location, called dereferencing, like so:

mov rax, [some_address]   <=>     Moves the thing at 'some_address' into rax

This also works with things in registers:
mov rax, [rdi] <=> Moves the thing stored at the address of what rdi holds to rax

This works the same for writing:
mov [rax], rdi     <=>     Moves rdi to the address of what rax holds.

So if rax was 0xdeadbeef, then rdi would get stored at the address 0xdeadbeef:
[0xdeadbeef] = rdi

Note: memory is linear, and in x86, it goes from 0 - 0xffffffffffffffff (yes, huge).

python 复制代码

# Load the value stored at a memory address into rax:
mov rax,[0x404000]   #   mov     rax, qword ptr [0x404000]

# Store a register value to a memory address:
 mov [0x404000],rax   #   mov     qword ptr [0x404000],rax

# Load the value at the address into rax, then add 0x1337 to the value stored there
mov rax,[0x404000]
add rax,0x1337
mov [0x404000],rax
sub rax,0x1337       # undo the add so rax holds the original value again
# [or use a second register instead]:
mov rax,[0x404000]
mov rbx,rax
add rbx,0x1337
mov [0x404000],rbx

6. 内存操作：level 15~level 16

python 复制代码

Recall the following:
  The breakdown of the names of memory sizes:
    Quad Word   = 8 Bytes = 64 bits
    Double Word = 4 bytes = 32 bits
    Word        = 2 bytes = 16 bits
    Byte        = 1 byte  = 8 bits

In x86_64, you can access each of these sizes when dereferencing an address, just like using
bigger or smaller register accesses:
  mov al, [address]        <=>        moves the least significant byte from address to rax
  mov ax, [address]        <=>        moves the least significant word from address to rax
  mov eax, [address]       <=>        moves the least significant double word from address to rax
  mov rax, [address]       <=>        moves the full quad word from address to rax

python 复制代码

# In GAS Intel syntax (used by `.intel_syntax noprefix` and by the assembler
# behind pwntools' asm()), a size override is written "<size> ptr [addr]".
# Without "ptr", a bare byte / word / dword / qword is read as the constant
# 1 / 2 / 4 / 8 and added to the displacement, which is why
# `byte [0x403fff]` assembles to `byte ptr [0x404000]`. Write the real address
# and keep "ptr" rather than compensating by hand.

# Access size implied by the destination register:
mov al,  [0x404000]
mov bx,  [0x404000]
mov ecx, [0x404000]
mov rdx, [0x404000]

# The same four loads with the size spelled out:
mov al,  byte ptr  [0x404000]
mov bx,  word ptr  [0x404000]
mov ecx, dword ptr [0x404000]
mov rdx, qword ptr [0x404000]

#  结果
---------------- CODE ----------------
0x400000:       mov     al, byte ptr [0x404000]
0x400007:       mov     bx, word ptr [0x404000]
0x40000f:       mov     ecx, dword ptr [0x404000]
0x400016:       mov     rdx, qword ptr [0x404000]
--------------------------------------

7. 寄存器地址赋值：level 17~level 18

python 复制代码

  Set [rdi] = 0xdeadbeef00001337
  Set [rsi] = 0xc0ffee0000

mov rax,0xdeadbeef00001337   # a store to memory accepts at most a 32-bit immediate, so go through a register
mov [rdi],rax                # [rdi] = 0xdeadbeef00001337
mov rax, 0xc0ffee0000        # also wider than 32 bits
mov [rsi],rax                # [rsi] = 0xc0ffee0000

python 复制代码

  [0x404190] = 0xd5729
  [0x404198] = 0xe37d2
  rdi = 0x404190    #  task：取出 rdi 与 rdi+8 执向地址数据，求和存入 rsi 指向地址
  rsi = 0x4047a8

    mov rax, [rdi+8]    # rax = qword at rdi+8
    mov rbx,[rdi]       # rbx = qword at rdi
    add rbx,rax         # rbx = sum of the two
    mov [rsi],rbx       # store the sum at the address held in rsi

8. 栈调用：level 19~level 21

python 复制代码

# Subtract rdi from the value on top of the stack, using pop/push
pop rax
sub rax,rdi
push rax
# or: modify the top-of-stack slot in place through rsp
mov rax,[rsp]
sub rax,rdi
mov [rsp],rax

python 复制代码

    # Swap rdi and rsi using only the stack
    push rdi
    push rsi
    pop rdi      # rdi = old rsi (last pushed, first popped)
    pop rsi      # rsi = old rdi

python 复制代码

    # Average of the four qwords on top of the stack, pushed as the new top.
    mov rax, [rsp]
    add rax,[rsp+8]
    add rax,[rsp+16]
    add rax,[rsp+24]       # rax = sum of the four values
    mov rdi,4
    xor rdx,rdx            # div divides rdx:rax, so the high half must be zero
    div rdi                # rax = sum / 4
    push rax
---------------- CODE ----------------
0x400000:       mov     rax, qword ptr [rsp]
0x400004:       add     rax, qword ptr [rsp + 8]
0x400009:       add     rax, qword ptr [rsp + 0x10]
0x40000e:       add     rax, qword ptr [rsp + 0x18]
0x400013:       mov     rdi, 4
0x40001a:       xor     rdx, rdx
0x40001d:       div     rdi
0x400020:       push    rax
--------------------------------------

#  Variant that loads the four values into registers first. div divides the
#  128-bit value rdx:rax, so rdx (which held the fourth value) must be cleared
#  after the additions; without that the quotient is wrong. Stack alignment is
#  not involved.
    mov rax,[rsp]
    mov rbx,[rsp+8]
    mov rcx,[rsp+16]
    mov rdx,[rsp+24]
    mov rdi,4
    add rax,rbx
    add rax,rcx
    add rax,rdx            # rax = sum of the four values
    xor rdx,rdx            # high half of the dividend = 0
    div rdi                # rax = sum / 4
    push rax
---------------- CODE ----------------
0x400000:       mov     rax, qword ptr [rsp]
0x400004:       mov     rbx, qword ptr [rsp + 8]
0x400009:       mov     rcx, qword ptr [rsp + 0x10]
0x40000e:       mov     rdx, qword ptr [rsp + 0x18]
0x400013:       mov     rdi, 4
0x40001a:       add     rax, rbx
0x40001d:       add     rax, rcx
0x400020:       add     rax, rdx
0x400023:       xor     rdx, rdx
0x400026:       div     rdi
0x400029:       push    rax
--------------------------------------

然而，还有另一种更有趣的除法方法。
除法使用shr

我们知道，字节中的每一位都对应 2 的某次幂。

+-------+-------+-------+-------+-------+-------+-------+-------+
|   1   |   0   |   0   |   0   |   0   |   0   |   0   |   0   |
| (2^7) | (2^6) | (2^5) | (2^4) | (2^3) | (2^2) | (2^1) | (2^0) |
+-------+-------+-------+-------+-------+-------+-------+-------+

上面字节的值是 1x（2^7），等于 128。

如果我们向右移动 2 位，我们会得到以下结果。

+-------+-------+-------+-------+-------+-------+-------+-------+
|   0   |   0   |   1   |   0   |   0   |   0   |   0   |   0   |
| (2^7) | (2^6) | (2^5) | (2^4) | (2^3) | (2^2) | (2^1) | (2^0) |
+-------+-------+-------+-------+-------+-------+-------+-------+

字节的值现在是 1x（2^5），即 32。因此，我们基本上在不使用 div 指令的情况下将数字除以了 4。

现在，我们只需要对存储在 rax 中的总和做同样的事情，就能得到平均值。

shr rax, 2

接下来，我们必须使用 push 指令将平均值复制到堆栈上。

push rax

堆栈将如下所示：

         +-------------------------+
RSP+0x20 |       Quad Word A       | <------ rbp
         +-------------------------+
RSP+0x18 |       Quad Word B       |
         +-------------------------+
RSP+0x10 |       Quad Word C       |
         +-------------------------+
RSP+0x08 |       Quad Word D       |
         +-------------------------+
RSP      |         Average         | <------ rsp
         +-------------------------+

9. 程序跳转指令： level 22 ~ level 24

python 复制代码

# jmp 跳转绝对地址需要把地址放入寄存器接上jmp命令。

mov r12,0x403000    # put the absolute target address in a register first
jmp r12             # indirect jump to the address held in r12

---------------- CODE ----------------
0x400071:       mov     r12, 0x403000
0x400078:       jmp     r12
--------------------------------------

间接跳转到所在地址后面0x51地址，并在该地址将 rax 赋值为 0x1。

汇编源文件常见后缀有 .S、.s 和 .asm：用 gcc 编译时，.S 文件会先经过 C 预处理器，.s 文件属于中间文件，不进行预处理；直接调用 as 时（如下面的命令）不会做任何预处理。

python 复制代码

.intel_syntax noprefix      # Intel operand order, registers written without %
.global _start              # export the entry symbol

_start:                     # defined once; a second definition is an assembler error
        jmp next            # relative jump over the padding below
        .rept 0x51          # 0x51 one-byte nops, so `next` lies 0x51 bytes past the jmp
                nop
        .endr
next:
        mov rax,1           # executed at the jump target

python 复制代码

as -o asm.o asm.S && objcopy -O binary --only-section=.text ./asm.o ./asm.bin && cat ./asm.bin | /challenge/run

python 复制代码

.intel_syntax noprefix      # Intel operand order, registers written without %
.global _start              # export the entry symbol

_start:                     # defined once; a second definition is an assembler error
        jmp next            # relative jump over the padding below
        .rept 0x51          # 0x51 one-byte nops, so `next` lies 0x51 bytes past the jmp
                nop
        .endr
next:
        mov rdi,[rsp]       # load the value on top of the stack; `mov rdi,rsp` would copy
                            # the stack pointer itself rather than the value it points at
        mov rax,0x403000    # absolute target address goes through a register
        jmp rax

10. 分支循环语句： level 25~level 28

bash 复制代码

Using the above knowledge, implement the following:
if [x] is 0x7f454c46:
y = [x+4] + [x+8] + [x+12]
  else if [x] is 0x00005A4D:
y = [x+4] - [x+8] - [x+12]
  else:
y = [x+4] * [x+8] * [x+12]

where:
x = rdi, y = rax.

shell 复制代码

# Three-way dispatch on the dword at [rdi]; the result is built in eax from the
# dwords at [rdi+4], [rdi+8] and [rdi+12].
# (To work with rax/rbx/rcx instead, try clearing them with xor first.)
        mov     eax, [rdi]
        cmp     eax, 0x7f454c46
        je      case_sum
        nop
        mov     eax, [rdi]
        cmp     eax, 0x00005A4D
        je      case_diff
        nop                             # this nop appears to be unnecessary

        # default case: product of the three dwords
        mov     eax, [rdi+4]
        imul    eax, [rdi+8]
        imul    eax, [rdi+12]
        jmp     finish                  # execution falls through labels, so every
                                        # case must jump past the ones that follow it

case_sum:                               # [rdi] == 0x7f454c46: sum
        mov     eax, [rdi+4]
        add     eax, [rdi+8]
        add     eax, [rdi+12]
        jmp     finish

case_diff:                              # [rdi] == 0x00005A4D: difference
        mov     eax, [rdi+4]
        sub     eax, [rdi+8]
        sub     eax, [rdi+12]

finish:                                 # common exit shared by all cases
        and     eax, eax

python 复制代码

# Same three-way dispatch, with the three operands loaded up front:
# ebx = [rdi+4], ecx = [rdi+8], edx = [rdi+12]; the result is moved to eax at the end.
        mov     ebx, [rdi+4]
        mov     ecx, [rdi+8]
        mov     edx, [rdi+12]

        mov     eax, [rdi]
        cmp     eax, 0x7f454c46
        je      do_sum
        nop
        mov     eax, [rdi]
        cmp     eax, 0x00005A4D
        je      do_diff
        nop

        # default case: product
        imul    ebx, ecx
        imul    ebx, edx
        jmp     store
        nop

do_sum:                                 # [rdi] == 0x7f454c46
        add     ebx, ecx
        add     ebx, edx
        jmp     store
        nop

do_diff:                                # [rdi] == 0x00005A4D
        sub     ebx, ecx
        sub     ebx, edx

store:
        mov     eax, ebx                # result goes in eax

level 26：基地址跳转，指令优化 (⭐)

shell 复制代码

In the above example, the jump table could look like:
  [0x1337] = address of do_thing_0
  [0x1337+0x8] = address of do_thing_1
  [0x1337+0x10] = address of do_thing_2
  [0x1337+0x18] = address of do_default_thing

Using the jump table, we can greatly reduce the amount of cmps we use.

Now all we need to check is if `number` is greater than 2.

If it is, always do:
  jmp [0x1337+0x18]
Otherwise:
  jmp [jump_table_address + number * 8]

Using the above knowledge, implement the following logic:
  if rdi is 0:
    jmp 0x403019
  else if rdi is 1:
    jmp 0x4030c9
  else if rdi is 2:
    jmp 0x4031d2
  else if rdi is 3:
    jmp 0x403287
  else:
    jmp 0x403378

Please do the above with the following constraints:
  Assume rdi will NOT be negative
  Use no more than 1 cmp instruction
  Use no more than 3 jumps (of any variant)
  We will provide you with the number to 'switch' on in rdi.
  We will provide you with a jump table base address in rsi.

Here is an example table:
  [0x40405b] = 0x403019 (addrs will change)
  [0x404063] = 0x4030c9
  [0x40406b] = 0x4031d2    #  ==
  [0x404073] = 0x403287
  [0x40407b] = 0x403378    #  ==

python 复制代码

# rsi holds the jump-table base (0x40405b in the example, so [rsi] = 0x403019).
mov rax,rdi
and rax,0xfffffffffffffffc  # clear the two low bits: the result is 0 exactly when rdi is 0..3
je normal                   # je tests ZF, which the `and` sets when its result is 0
jmp [rsi+32]                # rdi >= 4, default entry: 0x40407b-0x40405b == 0x20

normal:
jmp [rsi+rdi*8]             # 0:    0x40405b-0x40405b == 0x00 == 0x0*8
                            # 1:    0x404063-0x40405b == 0x08 == 0x1*8
                            # 2:    0x40406b-0x40405b == 0x10 == 0x2*8
                            # 3:    0x404073-0x40405b == 0x18 == 0x3*8

bash 复制代码

As an example, a for-loop can be used to compute the sum of the numbers 1 to n:
  sum = 0
  i = 1
  while i <= n:
    sum += i
    i += 1

Please compute the average of n consecutive quad words, where:
  rdi = memory address of the 1st quad word
  rsi = n (amount to loop for)
  rax = average computed

We will now set the following in preparation for your code:
  [0x4042f0:0x404498] = {n qwords]}
  rdi = 0x4042f0
  rsi = 53

bash 复制代码

# Average of n consecutive qwords.
#   in:  rdi = address of the first qword, rsi = n
#   out: rax = sum / n (unsigned)
# NOTE(review): n == 0 would make div fault; the task setup shown uses n = 53.
xor rax,rax               # rax = running sum
xor rcx,rcx               # rcx = index i
check:
  cmp rcx,rsi
  jae done                # continue only while i < n; `<=` would read one element too many
  add rax,[rdi+rcx*0x8]   # sum += qword number i
  add rcx,0x1
  jmp check
done:
  xor rdx,rdx             # div divides rdx:rax, so clear the high half first
  div rsi                 # rax = sum / n

bash 复制代码

As an example, say we had a location in memory with adjacent numbers and we wanted
to get the average of all the numbers until we find one bigger or equal to 0xff:
  average = 0
  i = 0
  while x[i] < 0xff:
    average += x[i]
    i += 1
  average /= i

Using the above knowledge, please perform the following:
  Count the consecutive non-zero bytes in a contiguous region of memory, where:
    rdi = memory address of the 1st byte
    rax = number of consecutive non-zero bytes

Additionally, if rdi = 0, then set rax = 0 (we will check)!

An example test-case, let:
  rdi = 0x1000
  [0x1000] = 0x41
  [0x1001] = 0x42
  [0x1002] = 0x43
  [0x1003] = 0x00

then: rax = 3 should be set

We will now run multiple tests on your code, here is an example run:
  (data) [0x404000] = {10 random bytes},
  rdi = 0x404000

bash 复制代码

# Count consecutive non-zero bytes starting at rdi.
#   in:  rdi = address of the first byte (may be 0)
#   out: rax = number of non-zero bytes before the first zero byte; 0 when rdi is 0
xor rax,rax                   # count = 0, also the answer for rdi == 0
test rdi,rdi
jz done                       # null pointer: nothing to scan

scan:
  cmp byte ptr [rdi+rax],0    # examine exactly one byte; a qword load would also
                              # test the seven bytes that follow it
  je done
  add rax,0x1                 # rax is both the count and the offset of the next byte
  jmp scan

done:
  nop                         # gives the label an instruction to land on; rax holds the count

11. 程序栈调用：level 29~level 30

level 29（⭐）

bash 复制代码

Functions use the instructions "call" and "ret".

The "call" instruction pushes the memory address of the next instruction onto
the stack and then jumps to the value stored in the first argument.

Let's use the following instructions as an example:
  0x1021 mov rax, 0x400000
  0x1028 call rax
  0x102a mov [rsi], rax

1. call pushes 0x102a, the address of the next instruction, onto the stack.
2. call jumps to 0x400000, the value stored in rax.

The "ret" instruction is the opposite of "call".

ret pops the top value off of the stack and jumps to it.

Let's use the following instructions and stack as an example:

                              Stack ADDR  VALUE
  0x103f mov rax, rdx         RSP + 0x8   0xdeadbeef
  0x1042 ret                  RSP + 0x0   0x0000102a

Here, ret will jump to 0x102a

Please implement the following logic:
  str_lower(src_addr):
    i = 0
    if src_addr != 0:
      while [src_addr] != 0x00:
        if [src_addr] <= 0x5a:
          [src_addr] = foo([src_addr])
          i += 1
        src_addr += 1
    return i

foo is provided at 0x403000.
foo takes a single argument as a value and returns a value.

All functions (foo and str_lower) must follow the Linux amd64 calling convention (also known as System V AMD64 ABI):
  https://en.wikipedia.org/wiki/X86_calling_conventions#System_V_AMD64_ABI

Therefore, your function str_lower should look for src_addr in rdi and place the function return in rax.

An important note is that src_addr is an address in memory (where the string is located) and [src_addr] refers to the byte that exists at src_addr.

Therefore, the function foo accepts a byte as its first argument and returns a byte.

We will now run multiple tests on your code, here is an example run:
  (data) [0x404000] = {10 random bytes},
  rdi = 0x404000

python 复制代码

#-----------------------------------------------------------------------
# str_lower(src_addr)
# ABI:  System V AMD64
# In:   rdi = src_addr, address of a zero-terminated byte string (may be 0)
# Out:  rax = number of bytes that were replaced by foo(byte)
# Calls foo at 0x403000 with the byte value in edi; its result is read from al.
# Loop state lives in callee-saved registers because foo is allowed to
# overwrite every caller-saved register (rax, rcx, rdx, rsi, rdi, r8-r11).
#-----------------------------------------------------------------------
str_lower:
        push    rbx                     # rbx = cursor into the string
        push    r12                     # r12 = count of converted bytes
        sub     rsp, 8                  # two pushes + this keep rsp 16-byte aligned at the call
        xor     r12d, r12d              # i = 0
        mov     rbx, rdi
        test    rbx, rbx
        jz      done                    # src_addr == 0: return 0

next_byte:
        movzx   edi, byte ptr [rbx]     # foo's argument: the byte value, zero-extended
        test    edi, edi
        jz      done                    # terminating 0x00 reached
        cmp     edi, 0x5a
        ja      skip                    # only bytes <= 0x5a are passed to foo
        mov     rax, 0x403000           # foo is provided at this absolute address
        call    rax
        mov     [rbx], al               # store foo's result back into the string
        add     r12, 1                  # i += 1
skip:
        add     rbx, 1                  # src_addr += 1
        jmp     next_byte

done:
        mov     rax, r12                # return i
        add     rsp, 8
        pop     r12
        pop     rbx
        ret

level 30（⭐）

bash 复制代码

#-----------------------------------------------------------------------
# Most frequent byte value in a buffer.
# NOTE(review): the task statement is not part of this write-up; the contract
# below is what the code implements -- confirm against the challenge text.
# ABI:   System V AMD64, leaf function
# In:    rdi = address of the bytes, rsi = number of bytes
# Out:   rax = byte value with the highest count (lowest value wins a tie)
# Clobb: rcx, rdx, r8, flags
# Stack: 512 bytes = 256 counters of 16 bits, indexed by byte value, so a
#        count up to 65535 does not wrap.
#-----------------------------------------------------------------------
most_common_byte:
        push    rbp                     # keep the caller's frame pointer
        mov     rbp, rsp
        sub     rsp, 512                # counter table at [rsp .. rsp+511]

        # The table is fresh stack memory: zero it before counting.
        xor     ecx, ecx
clear_table:
        mov     qword ptr [rsp+rcx*8], 0
        add     ecx, 1
        cmp     ecx, 64                 # 64 qwords = 512 bytes
        jb      clear_table

        # Pass 1: count how often each byte value occurs.
        xor     ecx, ecx                # i = 0
count_loop:
        cmp     rcx, rsi
        jae     scan_init               # stop once i == size
        movzx   eax, byte ptr [rdi+rcx]
        add     word ptr [rsp+rax*2], 1
        add     rcx, 1
        jmp     count_loop

        # Pass 2: find the byte value with the largest counter.
scan_init:
        xor     eax, eax                # best byte so far
        xor     edx, edx                # its count
        xor     ecx, ecx                # b = 0
scan_loop:
        movzx   r8d, word ptr [rsp+rcx*2]
        cmp     r8d, edx
        jbe     scan_next               # counts are unsigned; replace only when strictly greater
        mov     edx, r8d
        mov     eax, ecx
scan_next:
        add     ecx, 1
        cmp     ecx, 0x100
        jb      scan_loop

        leave                           # mov rsp, rbp ; pop rbp
        ret

参考wp

pwncollege通关笔记：3.Assembly Refresher(从0开始学习pwn) - FreeBuf网络安全行业门户
 Assembly Crash Course | Write-ups

Assembly Crash Course

前言

1. 寄存器赋值： level 1~level 2

2. 加减乘除（模）：level 3~level 6

3. 通用数据寄存器高低位：level 7 ~ level 8

4. 逻辑运算：level 9 ~ level 11

5. 内存<-->寄存器操作 ：level 12~level 14

6. 内存操作：level 15~level 16

7. 寄存器地址赋值：level 17~level 18

8. 栈调用：level 19~level 21

9. 程序跳转指令： level 22 ~ level 24

10. 分支循环语句： level 25~level 28

level 26：基地址跳转，指令优化 (⭐)

11. 程序栈调用：level 29~level 30

level 29（⭐）

level 30（⭐）

参考wp

5. 内存<-->寄存器操作：level 12~level 14