stack_head_clone3_futex.c
· 4.4 KiB · C
Raw
// libc-free x86-64 Linux multi-threading example
// $ cc -nostdlib stack_head.c
// Ref: https://nullprogram.com/blog/2023/03/23/
// This is free and unencumbered software released into the public domain.
#include "clone_args.h"
/* x86-64 Linux system call numbers passed as the first argument of syscall6(). */
#define SYS_write 1
#define SYS_mmap 9
#define SYS_nanosleep 35
/* NOTE(review): SYS_clone is not referenced in the code visible here;
   newthread() issues syscall number 435 directly. */
#define SYS_clone 56
#define SYS_exit 60
#define SYS_futex 202
#define SYS_exit_group 231
/* Operation codes passed as the second argument of SYS_futex. */
#define FUTEX_WAIT 0
#define FUTEX_WAKE 1
/* Fixed-arity front ends for syscall6(): each supplied argument is cast to
   long, and the unused trailing argument slots are filled with 0. */
#define SYSCALL1(n, a)                syscall6(n, (long)(a), 0, 0, 0, 0, 0)
#define SYSCALL2(n, a, b)             syscall6(n, (long)(a), (long)(b), 0, 0, 0, 0)
#define SYSCALL3(n, a, b, c)          syscall6(n, (long)(a), (long)(b), (long)(c), 0, 0, 0)
#define SYSCALL4(n, a, b, c, d)       syscall6(n, (long)(a), (long)(b), (long)(c), (long)(d), 0, 0)
#define SYSCALL5(n, a, b, c, d, e)    syscall6(n, (long)(a), (long)(b), (long)(c), (long)(d), (long)(e), 0)
#define SYSCALL6(n, a, b, c, d, e, f) syscall6(n, (long)(a), (long)(b), (long)(c), (long)(d), (long)(e), (long)(f))
/*
 * Issue a raw x86-64 Linux system call with up to six arguments.
 *
 * n        system call number (placed in rax)
 * a..f     arguments, placed in rdi, rsi, rdx, r10, r8 and r9 respectively
 *
 * Returns whatever the kernel leaves in rax: the call's result on success,
 * or a negated errno value on failure (there is no errno variable here).
 */
static long syscall6(long n, long a, long b, long c, long d, long e, long f) {
    long ret;
    /* r10/r8/r9 have no single-letter constraint, so bind them explicitly.
       `__asm` (not bare `asm`) keeps this compiling in strict ISO modes such
       as -std=c11, and matches the `__asm volatile` spelling used below. */
    register long r10 __asm("r10") = d;
    register long r8  __asm("r8")  = e;
    register long r9  __asm("r9")  = f;
    __asm volatile (
        "syscall"
        : "=a"(ret)
        : "a"(n), "D"(a), "S"(b), "d"(c), "r"(r10), "r"(r8), "r"(r9)
        /* the syscall instruction overwrites rcx and r11; the kernel may
           read or write memory the compiler does not know about */
        : "rcx", "r11", "memory"
    );
    return ret;
}
/*
 * Sleep for `ms` milliseconds using SYS_nanosleep.
 * The request is two longs: whole seconds, then the remaining nanoseconds.
 * The same array is passed as both syscall arguments; the syscall result
 * is ignored.
 */
static void millisleep(int ms) {
    long ts[2];
    ts[0] = ms / 1000;
    ts[1] = ms % 1000 * 1000000L;
    SYSCALL2(SYS_nanosleep, ts, ts);
}
/* int num = 65; */ // Ignore this, used for my own personal test
/*
 * Write all `len` bytes of `buf` to `fd`, re-issuing SYS_write after short
 * writes until everything has been written.
 *
 * Returns `len` on success, or the first negative syscall result on failure
 * (bytes written before the failure are not reported).
 */
static long fullwrite(int fd, void *buf, long len) {
    /* Byte cursor: arithmetic on `void *` is a GNU extension, not ISO C. */
    char *bytes = buf;
    long off = 0;
    while (off < len) {
        long r = SYSCALL3(SYS_write, fd, bytes + off, len - off);
        if (r < 0) {
            return r;
        }
        off += r;
    }
    return len;
}
/*
 * Invoke SYS_exit with `status`. Declared noreturn: the code after the
 * syscall is marked unreachable for the compiler.
 */
__attribute((noreturn)) static void exit(int status) {
    long code = (long)(status);
    syscall6(SYS_exit, code, 0, 0, 0, 0, 0);
    __builtin_unreachable();
}
/*
 * Invoke SYS_exit_group with `status`. Declared noreturn: the code after the
 * syscall is marked unreachable for the compiler.
 */
__attribute((noreturn)) static void exit_group(int status) {
    long code = (long)(status);
    syscall6(SYS_exit_group, code, 0, 0, 0, 0, 0);
    __builtin_unreachable();
}
/*
 * Issue a FUTEX_WAIT on `*futex` with expected value `expect` and a zero
 * (null) fourth argument. The syscall result is discarded, so callers must
 * re-check the futex word themselves if they need to know why it returned.
 */
static void futex_wait(int *futex, int expect) {
    long addr = (long)(futex);
    long want = (long)(expect);
    syscall6(SYS_futex, addr, (long)(FUTEX_WAIT), want, (long)(0), 0, 0);
}
/*
 * Issue a FUTEX_WAKE on `*futex`, asking for up to 0x7fffffff waiters to be
 * woken. The syscall result is discarded.
 */
static void futex_wake(int *futex) {
    long addr = (long)(futex);
    long max_waiters = (long)(0x7fffffff);
    syscall6(SYS_futex, addr, (long)(FUTEX_WAKE), max_waiters, 0, 0, 0);
}
/* Per-thread control block: _start() fills it in before spawning the thread
   and threadentry() reads it.
   The structure must have a 16-byte alignment on all architectures */
struct __attribute((aligned(16))) stack_head {
/* NOTE(review): newthread() executes `ret` on the new stack after copying
   rsp into rdi, so this member appears to need to stay first - confirm
   before reordering. */
void (*entry)(struct stack_head*); // The entry point pointer. Will receive a pointer to its own stack_head
/* The rest of the arguments can be filled with any thread-local
data we want. The following is just an example! */
char *message; // Bytes that threadentry() writes to file descriptor 1
long message_length; // Number of bytes of `message` to write each time
int print_count; // How many times threadentry() writes the message
int join_futex; // Join "futex": _start() sets it to 0 and waits on it; threadentry() stores 1 and wakes waiters when done
};
typedef struct stack_head stack_head;
/* Value returned by newstack(); both members are null when allocation failed. */
typedef struct {
void *tail; // _start() passes this as the clone `stack` argument
stack_head *head; // Control block filled in by _start(); `head - tail` is passed as the clone `stack_size`
} thread_stack;
/*
 * Start a new task with the clone3 system call (number 435 on x86-64).
 *
 * args   clone3 argument structure; its size is passed to the kernel as 88.
 *
 * In the caller, returns the raw syscall result (negative on failure).
 * In the new task the same `ret` executes on the new stack: rsp is first
 * copied into rdi, then `ret` pops the word at rsp and jumps to it. Per
 * clone3(2) the child's rsp starts at stack + stack_size, so that word is
 * expected to be the stack_head's `entry` pointer, which thereby receives
 * the stack_head address as its argument.
 *
 * The function is naked, so it contains nothing but a basic asm statement:
 * GCC documents extended asm (operands/clobbers) as unsupported there.
 */
__attribute((naked)) static long newthread(CloneArgs* args) {
    __asm volatile (
        /* "args" is already in "rdi" */
        "mov $88, %esi\n"   // arg2 = size (always "88" until further notice)
        "mov $435, %eax\n"  // SYS_clone3
        "syscall\n"
        "mov %rsp, %rdi\n"  // entry point argument
        "ret\n"
    );
}
/*
 * Thread body: writes stack->message to file descriptor 1 print_count times,
 * sleeping 25 ms after each write, then signals completion through
 * stack->join_futex and exits via exit(). Never returns.
 */
static void threadentry(stack_head *stack) {
    char *message = stack->message;
    /* Keep the full width of message_length; fullwrite() takes a long. */
    long length = stack->message_length;
    int count = stack->print_count;
    for (int i = 0; i < count; i++) {
        fullwrite(1, message, length);
        millisleep(25);
    }

    /* Publish "done" before waking, so a waiter that re-checks sees 1. */
    __atomic_store_n(&stack->join_futex, 1, __ATOMIC_SEQ_CST);
    futex_wake(&stack->join_futex);
    exit(0);
}
static thread_stack newstack(long size) {
unsigned long p = SYSCALL6(SYS_mmap, 0, size, 3, 0x22, -1, 0);
if (p > -4096UL) { /* Return "null" on failure */
thread_stack stack = { (void*)0, (void*)0 };
return stack;
}
long count = size / sizeof(thread_stack);
thread_stack stack = { (void*)(p + count - 1), (void*)count };
return stack;
}
/*
 * Program entry point (no libc): allocates a 64 KiB stack, fills in the
 * thread's stack_head, spawns the thread with newthread(), waits for it to
 * signal completion through join_futex, then exits the process.
 * Exits with status 1 if the stack cannot be allocated or the thread cannot
 * be created, 0 otherwise.
 */
__attribute((force_align_arg_pointer)) void _start(void) {
    thread_stack stack = newstack(1<<16);
    if (!stack.head) { /* newstack() returns null members on failure */
        exit_group(1);
    }
    stack.head->entry = threadentry;

    // Thread data
    stack.head->message = "hello world\n";
    stack.head->message_length = 12;
    stack.head->print_count = 20;

    stack.head->join_futex = 0; // Set the "futex"

    CloneArgs args = {
        CLONE_VM,
        0, 0, 0, SIGCHLD /* I have also tried using "0" here */,
        (__aligned_u64)stack.tail, /* stack */
        (char *)stack.head - (char *)stack.tail, /* stack_size */
        0, 0, 0, 0
    };

    /* A negative result means no thread exists; waiting would hang forever. */
    if (newthread(&args) < 0) {
        exit_group(1);
    }

    /* futex_wait() can return before the word changes, so re-check the
       word until the thread has stored a non-zero value. */
    while (__atomic_load_n(&stack.head->join_futex, __ATOMIC_SEQ_CST) == 0) {
        futex_wait(&stack.head->join_futex, 0);
    }
    exit_group(0);
}
1 | // libc-free x86-64 Linux multi-threading example |
2 | // $ cc -nostdlib stack_head.c |
3 | // Ref: https://nullprogram.com/blog/2023/03/23/ |
4 | // This is free and unencumbered software released into the public domain. |
5 | |
6 | #include "clone_args.h" |
7 | |
8 | #define SYS_write 1 |
9 | #define SYS_mmap 9 |
10 | #define SYS_nanosleep 35 |
11 | #define SYS_clone 56 |
12 | #define SYS_exit 60 |
13 | #define SYS_futex 202 |
14 | #define SYS_exit_group 231 |
15 | |
16 | #define FUTEX_WAIT 0 |
17 | #define FUTEX_WAKE 1 |
18 | |
19 | #define SYSCALL1(n, a) \ |
20 | syscall6(n,(long)(a),0,0,0,0,0) |
21 | #define SYSCALL2(n, a, b) \ |
22 | syscall6(n,(long)(a),(long)(b),0,0,0,0) |
23 | #define SYSCALL3(n, a, b, c) \ |
24 | syscall6(n,(long)(a),(long)(b),(long)(c),0,0,0) |
25 | #define SYSCALL4(n, a, b, c, d) \ |
26 | syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),0,0) |
27 | #define SYSCALL5(n, a, b, c, d, e) \ |
28 | syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),(long)(e),0) |
29 | #define SYSCALL6(n, a, b, c, d, e, f) \ |
30 | syscall6(n,(long)(a),(long)(b),(long)(c),(long)(d),(long)(e),(long)(f)) |
31 | |
32 | static long syscall6(long n, long a, long b, long c, long d, long e, long f) { |
33 | register long ret; |
34 | register long r10 asm("r10") = d; |
35 | register long r8 asm("r8") = e; |
36 | register long r9 asm("r9") = f; |
37 | __asm volatile ( |
38 | "syscall" |
39 | : "=a"(ret) |
40 | : "a"(n), "D"(a), "S"(b), "d"(c), "r"(r10), "r"(r8), "r"(r9) |
41 | : "rcx", "r11", "memory" |
42 | ); |
43 | return ret; |
44 | } |
45 | |
46 | static void millisleep(int ms) { |
47 | long ts[] = {ms/1000, ms%1000 * 1000000L}; |
48 | SYSCALL2(SYS_nanosleep, ts, ts); |
49 | } |
50 | |
51 | /* int num = 65; */ // Ignore this, used for my own personal test |
52 | |
53 | static long fullwrite(int fd, void *buf, long len) { |
54 | for (long off = 0; off < len;) { |
55 | long r = SYSCALL3(SYS_write, fd, buf+off, len-off); |
56 | if (r < 0) { return r; } |
57 | off += r; |
58 | } |
59 | |
60 | return len; |
61 | } |
62 | |
63 | __attribute((noreturn)) static void exit(int status) { |
64 | SYSCALL1(SYS_exit, status); |
65 | __builtin_unreachable(); |
66 | } |
67 | |
68 | __attribute((noreturn)) static void exit_group(int status) { |
69 | SYSCALL1(SYS_exit_group, status); |
70 | __builtin_unreachable(); |
71 | } |
72 | |
73 | static void futex_wait(int *futex, int expect) { |
74 | SYSCALL4(SYS_futex, futex, FUTEX_WAIT, expect, 0); |
75 | } |
76 | |
77 | static void futex_wake(int *futex) { |
78 | SYSCALL3(SYS_futex, futex, FUTEX_WAKE, 0x7fffffff); |
79 | } |
80 | |
81 | /* The structure must have a 16-byte alignment on all architectures */ |
82 | struct __attribute((aligned(16))) stack_head { |
83 | void (*entry)(struct stack_head*); // The entry point pointer. Will receive a pointer to its own stack_head |
84 | |
85 | /* The rest of the arguments can be filled with any thread-local |
86 | data we want. The following is just an example! */ |
87 | char *message; |
88 | long message_length; |
89 | int print_count; |
90 | int join_futex; // Join "futex" to help us "wait" to "join" that thread later |
91 | }; |
92 | |
93 | typedef struct stack_head stack_head; |
94 | |
95 | typedef struct { |
96 | void *tail; |
97 | stack_head *head; |
98 | } thread_stack; |
99 | |
100 | __attribute((naked)) static long newthread(CloneArgs* args) { |
101 | __asm volatile ( |
102 | /* "args" is already in "rdi" */ |
103 | "mov $88, %%esi\n" // arg2 = size (always "88" until further notice) |
104 | "mov $435, %%eax\n" // SYS_clone |
105 | "syscall\n" |
106 | "mov %%rsp, %%rdi\n" // entry point argument |
107 | "ret\n" |
108 | : : : "rax", "rcx", "rsi", "rdi", "r11", "memory" |
109 | ); |
110 | } |
111 | |
112 | static void threadentry(stack_head *stack) { |
113 | char *message = stack->message; |
114 | int length = stack->message_length; |
115 | int count = stack->print_count; |
116 | for (int i = 0; i < count; i++) { |
117 | fullwrite(1, message, length); |
118 | millisleep(25); |
119 | } |
120 | |
121 | __atomic_store_n(&stack->join_futex, 1, __ATOMIC_SEQ_CST); |
122 | futex_wake(&stack->join_futex); |
123 | exit(0); |
124 | } |
125 | |
126 | static thread_stack newstack(long size) { |
127 | unsigned long p = SYSCALL6(SYS_mmap, 0, size, 3, 0x22, -1, 0); |
128 | |
129 | if (p > -4096UL) { /* Return "null" on failure */ |
130 | thread_stack stack = { (void*)0, (void*)0 }; |
131 | return stack; |
132 | } |
133 | |
134 | long count = size / sizeof(thread_stack); |
135 | thread_stack stack = { (void*)(p + count - 1), (void*)count }; |
136 | return stack; |
137 | } |
138 | |
139 | __attribute((force_align_arg_pointer)) void _start(void) { |
140 | thread_stack stack = newstack(1<<16); |
141 | stack.head->entry = threadentry; |
142 | |
143 | // Thread data |
144 | stack.head->message = "hello world\n"; |
145 | stack.head->message_length = 12; |
146 | stack.head->print_count = 20; |
147 | |
148 | stack.head->join_futex = 0; // Set the "futex" |
149 | |
150 | CloneArgs args = { |
151 | CLONE_VM, |
152 | 0, 0, 0, SIGCHLD /* I have also tried using "0" here */, |
153 | (__aligned_u64)stack.tail, /* stack */ |
154 | (void *)stack.head - stack.tail, /* stack_size */ |
155 | 0, 0, 0, 0 |
156 | }; |
157 | |
158 | newthread(&args); |
159 | |
160 | futex_wait(&stack.head->join_futex, 0); |
161 | exit_group(0); |
162 | } |