// libc-free x86-64 Linux multi-threading example // $ cc -nostdlib stack_head.c // Ref: https://nullprogram.com/blog/2023/03/23/ // This is free and unencumbered software released into the public domain. // #include "clone_args.h" #define SYS_write 1 #define SYS_mmap 9 #define SYS_nanosleep 35 #define SYS_clone 56 #define SYS_exit 60 #define SYS_waitid 247 #define SYS_exit_group 231 #define SYSCALL1(n, a) \ syscall6(n,(long)(a),0,0,0,0,0) #define SYSCALL2(n, a, b) \ syscall6(n,(long)(a),(long)(b),0,0,0,0) #define SYSCALL3(n, a, b, c) \ syscall6(n,(long)(a),(long)(b),(long)(c),0,0,0) #define SYSCALL4(n, a, b, c, d) \ syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),0,0) #define SYSCALL5(n, a, b, c, d, e) \ syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),(long)(e),0) #define SYSCALL6(n, a, b, c, d, e, f) \ syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),(long)(e),(long)(f)) static long syscall6(long n, long a, long b, long c, long d, long e, long f) { register long ret; register long r10 asm("r10") = d; register long r8 asm("r8") = e; register long r9 asm("r9") = f; __asm volatile ( "syscall" : "=a"(ret) : "a"(n), "D"(a), "S"(b), "d"(c), "r"(r10), "r"(r8), "r"(r9) : "rcx", "r11", "memory" ); return ret; } static void millisleep(int ms) { long ts[] = {ms/1000, ms%1000 * 1000000L}; SYSCALL2(SYS_nanosleep, ts, ts); } /* int num = 65; */ // Ignore this, used for my own personal test static long fullwrite(int fd, void *buf, long len) { for (long off = 0; off < len;) { long r = SYSCALL3(SYS_write, fd, buf+off, len-off); if (r < 0) { return r; } off += r; } return len; } __attribute((noreturn)) static void exit(int status) { SYSCALL1(SYS_exit, status); __builtin_unreachable(); } __attribute((noreturn)) static void exit_group(int status) { SYSCALL1(SYS_exit_group, status); __builtin_unreachable(); } long sys_waitid() { struct siginfo s; return SYSCALL4(SYS_waitid, 0 /* P_ALL */, 0, &s, 0x40000000 /* __WALL */); } /* The structure must have a 16-byte alignment on all architectures */ struct __attribute((aligned(16))) stack_head { void (*entry)(struct stack_head*); // The entry point pointer. Will receive a pointer to its own stack_head /* The rest of the arguments can be filled with any thread-local data we want. The following is just an example! */ char *message; long message_length; int print_count; }; typedef struct stack_head stack_head; __attribute((naked)) static long newthread(CloneArgs* args) { __asm volatile ( /* "args" is already in "rdi" */ "mov $88, %%esi\n" // arg2 = size (always "88" until further notice) "mov $435, %%eax\n" // SYS_clone3 "syscall\n" "mov %%rsp, %%rdi\n" // entry point argument "ret\n" : : : "rax", "rcx", "rsi", "rdi", "r11", "memory" ); } static void threadentry(stack_head *stack) { char *message = stack->message; int length = stack->message_length; int count = stack->print_count; for (int i = 0; i < count; i++) { fullwrite(1, message, length); millisleep(25); } exit(0); } static stack_head *newstack(long size) { unsigned long p = SYSCALL6(SYS_mmap, 0, size, 3, 0x22, -1, 0); if (p > -4096UL) { return 0; } long count = size / sizeof(stack_head); return (stack_head*)p + count - 1; } __attribute((force_align_arg_pointer)) void _start(void) { stack_head* stack = newstack(1<<16); stack->entry = threadentry; // Thread data stack->message = "hello world\n"; stack->message_length = 12; stack->print_count = 20; CloneArgs args = { CLONE_VM, 0, 0, 0, SIGCHLD /* I have also tried using "0" here */, (__aligned_u64)stack, /* stack */ 1 << 16, /* stack_size */ 0, 0, 0, 0 }; newthread(&args); /* Fails */ if (sys_waitid() != 0) { exit_group(10); }; // Try to sleep and it indeed sleeps but the thread doesn't write the message /* millisleep(2000); */ exit_group(0); }