hack.lu CTF - Challenge 12 WriteUp

Mon 29 November 2010 by jean

Here is the missing Hack.lu CTF write-up for the "seamonster" challenge. It was a Windows reverse engineering challenge, with a nice anti-debugging trick.

The challenge objective is to give "Ring3" the correct password to keep our ship afloat and get the gold ! Let's have a look at the main function :

char Buf[32];   //holds the users input

int wmain(int argc, wchar_t* argv[])
    DWORD pid_1, dwProcessId;
    PROCESS_INFORMATION ProcessInformation;
    STARTUPINFOW StartupInfo;
    WCHAR commandLineParams[16];

    if ( argc == 1 )    //first launch
        pid_1 = GetCurrentProcessId();
        wsprintfW(commandLineParams, L"%d %d", 1, pid_1);

        std::basic_string<WCHAR> s;
        s += argv[0];
        s += L" ";
        s += commandLineParams;

        memset(&StartupInfo, 0, sizeof(StartupInfo));

        //seamonster.exe 1 <PID_1>
        CreateProcessW(argv[0], (LPWSTR) s.c_str(), 0, 0, 0,
                DEBUG_ONLY_THIS_PROCESS, 0, 0, &StartupInfo, &ProcessInformation);
        LFSR1_runloop();    //sub_401F80()  (noreturn)
    if ( argc == 3 )    //second and third processes
        if ( argv[1][0] == '1' )
            wsprintfW(commandLineParams, L"%d %s", 2, argv[2]);

            std::basic_string<WCHAR> s;
            s += argv[0];
            s += L" ";
            s += commandLineParams;

            memset(&StartupInfo, 0, sizeof(StartupInfo));

            //seamonster.exe 2 <PID_1>
            CreateProcessW(argv[0], (LPWSTR)s.c_str(), 0, 0, 0,
                        DEBUG_ONLY_THIS_PROCESS, 0, 0, &StartupInfo, &ProcessInformation);
            LFSR2_runloop();    //sub_401FC0() (noreturn)
        printf(">>> ");
        gets_s(Buf, 32);    //get user input

        swscanf(argv[2], L"%d", &dwProcessId);   //get <PID_1> from command line
        if ( DebugActiveProcess(dwProcessId) == 1 ) //attach 3rd process to 1st
    return 0;

The anti-debug technique used is interesting : 3 processes are spawned and debug each other in a circular fashion (1->2->3->1), hence preventing another ring3 debugger to attach. Furthermore, each process manages a 128 bit linear feedback shift register (LFSR), and the feedback bits are exchanged through debug interrupts : the debugger process injects the bit value in the eax register of the debuggee using SetThreadContext. Therefore, removing the debugging code makes the binary non functional. The keystream generated by the LFSRs is used by the third process to decrypt two functions, which we will call stage1 and stage2. The following C code shows the `` process3_main`` function :

void process3_main()    //0x402080
    //init LFSR function pointers
    ptr_f1 = &LFSR3_f1_output;
    ptr_f2 = &LFSR3_f2_feedback;
    ptr_f3 = &LFSR3_f3_update;
    ptr_setState = &LFSR3_setState;
    ptr_xor_string = &LFSR3_xor_string;
    LFSR3_newState = "FluxFingers";

    //wait debug attach to process 1

    //tell parent processes to reset LFSR state
        push eax
        mov eax,0
        int 3
        pop eax
    //wait for the interrupt to loop back to us
    while ( WaitForDebugAndSetEAX() != 1 );

    //init LFSR3 state with LFSR3_newState

    //drop first 50 keystream bytes
    for(i=0; i < 50; i++)

    //decrypt first encrypted function
    for(i=0; i < 384; i++)      //loc_4020F0
        *(&stage1 + i) ^= LFSR3_getByte();

Instead of reversing the LFSR algorithm, we went the easy way and patched the binary to dump the keystream by adding some calls to printf. This solution is probably not the most elegant but allowed us to quickly solve this challenge. For instance in the previous function, we simply patched the second loop to display each byte of the key by jumping to an existing call to printf and setting the format string to "%02x". The following "bindiff" shows the idea :

 call    sub_402000     ; LFSR3_getByte ()
 mov     ecx, offset loc_402430  ; stage1
-xor     [ecx+esi], al
-add     ecx, esi
+and     eax, 0FFh
+call    sub_4021D0
 inc     esi
 cmp     esi, 384
 jl      short loc_4020F0
 call    loc_402430     ; stage1()


-push    0
+push    eax
-push    offset aRing3T_t_h_s_0 ; "Ring3 T.T.H.S.M.> Mrrrmmmmmpffff!\nRing3"...
+push    offset a02x     ; "%02x"
 call    ds:printf
 add     esp, 8

Once we run the patched binary, the keystream is displayed and we can then decrypt the stage1 function and disassemble it to get the following code :

void stage1()
    struct MD5_CTX md5_context;
    char stage2key[214];
    char* heapBuffer;
    void (*stage2ptr)();

    MD5_Init(&md5_context); //inlined
    //hash the whole .text section except for the encrypted stage 2 at 0x4025B0
    MD5_Update(0x15B0, &md5_context, 0x401000); //first parameter is data size
    MD5_Update(0x97A, &md5_context, 0x402686);

    //trigger LFSR state reset in parent processes
        push eax
        mov eax,0
        int 3
        pop eax
    //wait for the interrupt to loop back to us
    while ( WaitForDebugAndSetEAX() != 1 );

    //use integrity check output to reinitialize LFSR
    LFSR3_newState = &md5_context->digest;

    //drop first 60 bytes of keystream
    for(i=0; i < 60; i++)

    HANDLE heap = HeapCreate(HEAP_CREATE_ENABLE_EXECUTE, 0, 0);

    srand((int) heap);
    int randomIndex = rand() % 1000;

    for(i=0; i < 214; i++)
        //we'll put our second "printf patch" here to get the key
        stage2key[i] = LFSR3_getByte();

    for(i=0 ; i < 1000; i++)
        heapBuffer = HeapAlloc(heap, 0, 214);
        if ( i == randomIndex )
            //copy encrypted stage 2 to heap memory
            memcpy(heapBuffer, 0x4025B0, 214);
            stage2ptr = heapBuffer;

        for(i=0; i < 107; i+=2)
            heapBuffer[i] ^= stage2key[i];
            heapBuffer[i+1] ^= stage2key[i+1];
    //call decrypted stage2 on heap

This function allocates 1000 buffers and run the decryption code on each, but only one of them (randomly chosen) will be initialized with the encrypted stage2. If we do the same as before and patch the loop which gets the keystream, the decrypted code ends up being incorrect. In fact, the MD5 digest is used to reinitialize the LFSR, so I had to patch the MD5_final function to always return the right value (140e351638717e04e73ff8ec5e2a81db). Once this is done we can decrypt and reverse the last stage :

void stage2(void* stage2heapBuffer)
    DWORD stage2_opcodes[3];
    char good_passphrase[32];

    stage2_opcodes[0] = ((DWORD*)stage2heapBuffer)[0];
    stage2_opcodes[1] = ((DWORD*)stage2heapBuffer)[1];
    stage2_opcodes[2] = ((DWORD*)stage2heapBuffer)[2];

    //reset LFSR state with first stage2 bytes
    LFSR3_newState = &stage2_opcodes[0];

    memcpy(good_passphrase, encrypted_passphrase, 32);
    //xor encrypted passphrase with keystream
    (*ptr_xor_string)(good_passphrase, 31);

    //insert printf here :)
    int passOk = !strcmp(Buf, good_passphrase);

    memset(good_passphrase, 0, 32);

    //erase current function and jump to display_win/fail (ROP)
    //memset(stage2heapBuffer, 0, 214);

        push 0
        push 214
        push 0
        push stage2heapBuffer
    if ( passOk )
        __asm{ push offset display_win}
        __asm{ push offset display_fail}

        push offset memset

The LFSR state is reset again, this time using the first bytes of the stage2 function, and then the correct passphrase is decrypted (it is stored in the .data section) and compared with the user's input. One last "printf patch" at the right spot and we finally get the right answer : "Ring3?! Lol! Joanna did Ring-1!".

After the CTF was over I took some time to fully reverse the LFSR algorithm, and the following Python script reproduces the seamonster functionality to decrypt the two stages and the passphrase.

import hashlib, struct

def xor_string(data_string, key_int):
    res = ""
    for i in xrange(len(data_string)):
        res += chr( ord(data_string[i]) ^ key_int[i % len(key_int)])
    return res

def getbit(x, pos):
    return (x >> pos) & 1

def insertHighBit(x, b):
    return (x >> 1) | (b << 127)

class LFSR_process1:
    def __init__(self):
        self.state = 0xBBCCDDEEFF0011223344000000000000

    def f1_output(self):
        return  #not used

    def f2_feedback(self):
        return (getbit(self.state,108) | getbit(self.state,44)) \
            ^ (getbit(self.state,123) | getbit(self.state,59)) \
            ^ (getbit(self.state,109) | getbit(self.state,45)) \
            & (getbit(self.state,110) | getbit(self.state,46))

    def f3_update(self, feedback):
        newbit = feedback ^ (getbit(self.state,51) | getbit(self.state,114))
        self.state = insertHighBit(self.state, newbit)

class LFSR_process2:
    def __init__(self):
        self.state = 0x000000000000000000000000000e0000

    def f1_output(self):
        return    #not used

    def f2_feedback(self):
        return (getbit(self.state,81) | getbit(self.state,17)) \
            ^ (getbit(self.state,126) | getbit(self.state,62)) \
            ^ (getbit(self.state,82) | getbit(self.state,18)) \
            & (getbit(self.state,83) | getbit(self.state,19))

    def f3_update(self, feedback):
        newbit = feedback ^ (getbit(self.state,42) | getbit(self.state,105))
        self.state = insertHighBit(self.state, newbit)

class LFSR_process3:
    def __init__(self,p1, p2):
        self.state = 0
        self.p1 = p1
        self.p2 = p2

    def setState(self, str):
        str = str[:10]  #only use 10 bytes
        self.state = 0
        self.state |= struct.unpack(">L", str[:4])[0] << 96
        self.state |= struct.unpack(">L", str[4:8])[0] << 64
        self.state |= struct.unpack(">H", str[8:10])[0] << 48

    def f1_output(self):
        return (getbit(self.state, 99) | getbit(self.state,35)) \
             ^ (getbit(self.state,126) | getbit(self.state,62))

    def f2_feedback(self):
        return (getbit(self.state,99) | getbit(self.state,35)) \
            ^ (getbit(self.state,126) | getbit(self.state,62)) \
            ^ (getbit(self.state,100) | getbit(self.state,36)) \
            & (getbit(self.state,101) | getbit(self.state,37))

    def f3_update(self, feedback):
        newbit = feedback ^ (getbit(self.state,60) | getbit(self.state,123))
        self.state = insertHighBit(self.state, newbit)

    def getByte(self):
        x = 0
        for i in xrange(8):
            x |= self.f1_output() << (7-i)
            feedback = self.p2.f2_feedback()
        return x

    def getBytes(self, n):
        return [self.getByte() for i in xrange(n)]


encrypted_stage1 = binary[0x1830:0x1830+384]
encrypted_stage2 = binary[0x19b0:0x19b0+214]
encrypted_password = binary[0x2544:0x2544+31]

lfsr3 = LFSR_process3(LFSR_process1(), LFSR_process2())

#Drop first 50 bytes of keystream
drop = lfsr3.getBytes(50)

stage1key = lfsr3.getBytes(384)
stage1 = xor_string(encrypted_stage1, stage1key)

binary_with_stage1 = binary.replace(encrypted_stage1, stage1)

#stage1 operations

md5sum = hashlib.md5()

print "MD5 integrity check :", md5sum.hexdigest()
#reset all states (int3 eax=1)
lfsr3 = LFSR_process3(LFSR_process1(), LFSR_process2())
#use md5 for process3 state

#Drop first 60 bytes of keystream

stage2key = lfsr3.getBytes(214)
stage2 = xor_string(encrypted_stage2, stage2key)

#stage2 operations
#reset LFSR3 state with decrypted stage2 instructions

passkey = lfsr3.getBytes(len(encrypted_password))

#Ring3?! Lol! Joanna did Ring-1!
print "Decrypted password :", xor_string(encrypted_password, passkey)

That's it for the seamonster, thanks again to FluxFingers for this great CTF.