int in, out, buffer[BUFFERSIZE];
mutex_t m;
cond_var_t not_empty, not_full;
while(more_to_produce){
mutex_lock(&m);
if (out == (in + 1) % BUFFERSIZE) // buffer full
condition_wait(¬_full);
add_item(buffer[in]);
in = (in + 1) % BUFFERSIZE
cond_broadcast(¬_empty);
}
while (more_to_consume){
mutex_lock(&m);
if(out == in) // buffer empty
condtion_wait(¬_empty);
remove_item(out);
out = (out + 1) % BUFFERSIZE;
condtion_signal(¬_empty);
}
Experimental Results
Best Case Numbers
Synthetic load:
-N requests for same file
=> best case
Measure Bandwidth
-BW = N * bytes(F)/time
-File size 0-200kb
=> vary work per request
Observations:
-all exhibit similar results
-SPED has best performance
– Flash AMPED extra check for memory presence
– Zeus has anomaly
When data is in cache:
– SPED >> Amped Flash
– SPED and AMPED Flash >> HT/MP
With disk-bound workload
– AMPED FLASH >> SPED
Flash web server
Flash: Event-Driven web server
– an event driven webserver
– with asymmetric helper processes
Handlers, Event Dipatcher, Helpers
Flash: Addtional Optimizations
Start, accept conn, read request, parse request, find file, computer header, send hader, read file send data, End
Apache web server
core = basic server skelethon
module -> Apache core -> request・response
Setting up performance comparissons
-mp (each process single thread)
-mt(boss-worser)
-single process event-driven(sped)
-leus(sped w/2 processes)
-apache(vl31, MP)
compare against Flash(AMPED model)
Realistic request workload
=> distribution of web page accesses over time
Controlled, reproducible workload
=> trace-based (from real web servers)
CS web server trace, owlnet trace, synthetic workload
Bandwidth == bytes/time
=> total bytes xfered from files/ total time
Connection Rate == request/time
=> total client conn / total time
Tasks in Linux
task creation: clone
clone(function, stack_ptr, sharing_flags, args)
sharing_flags
CLINE_VM, create a new thread
CLONE_FS, share unmask, root, and working dirs
CLoE_FILES, share the file descriptiors
CLONE_SIGHAND, share the signal handler table
CLONE_PID, new thread gets old PID
CLINE_PARENT, new thread has same parent as caller
Native POSIX Thread Library(NPTL)
– kernel see each UTL info
– kernel traps are cheaper
– more resources: memory, large range of IDs…
Older LinuxTreads “H-H model”
– similar issues to thoese described in Solaris papers
Thread Performance Considerations
– performance comparisons
– multi-process vs. multi-threaded vs. event-driven
– Event-driven architectures
– “Flash: An Efficient and Portable Web server” vs Apache
Boss-worker “Ooms per toy order”
Pipeline “20ms per pipeline stage”
execution time avg time to complete order
Are threads useful?
threads are useful because of..
parallelization => speed up
specialization => hot cache!
efficiency => lower memory requirement and cheaper
For a matrix multiply application
=> execution time
-> depend on metrics
For a web service application …
=> number of client requests / time
=> response time
-> average, max, min, 95%
For hardware …
=> higher utilization (e.g. CPU)
Performance Metrics
metrics == a measurement standard
-measurable and/or quantifiable property
Thread Design Considerations
dynamic thread creation is expensive
Dynamic decision
– if handler doesn’t lock => execute on interrupted threads stack
– if handler can block => turn into real thread
Optimization
– precreate z prenitialize thread structures for interrupt routines
Threads and Signal Handling
– disable => clear signal
– signal occurs – what to do with the signal?
SIGNAL-N, handler-N-start-addr
user, kernel mask
Optimize Common Case
– signals less frequent than signal mask updates
– system calls avoided cheaper to update UL mask
Task struct in Linux
– main execution abstraction => task
Struct task_struct {
// ...
pid_t pid;
pid_t tgid;
int prio;
volatile long state;
struct mm_struct **mm;
struct files_struct *files;
struct list_head tasks;
int on_cpu;
cpumask_t cpus_allowed;
// ...
}
Producer and Consumer
void *producer (void *param){
int i;
for (i = 1; i <= 20; i++){
pthread_mutex_lock (&m);
if(num > BUF_SIZE){ /* overflow */
exit(1);
}
while (num == BUF_SIZE){
pthread_cond_wait (&_prod, &m)
}
buffer[add] = i;
add = (add + 1) % BUF_SIZE;
num++;
pthread_mutex_unlock (&m);
pthread_cond_signal (&c_cons);
printf ("producer: inserted %d\n", i); fflush(stdout);
}
printf ("producer quiting\n"); fflush(stdout);
return 0;
}
Variable Safety Tips
– Do not forget to notify waiting threads!
– predicate change =>
signal/broadcast correct condition variable
– When in doblt broadcast
– but performance loss
– You do not need a mutex to signal/broadcast
#include <stdio.h> #include <stdlib.h> #include <pthread.h> #define BUF_SIZE 3 /* size of shared buffer */ int buffer[BUF_SIZE]; int add = 0; int rem = 0; int num = 0; pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER; pthread_cond_t c_cons = PTHREAD_COND_INITIALIZER; pthread_cond_t c_prod = PTHREAD_COND_INITIALIZER; void *producer(void *param); void *consumer(void *param);
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#define BUF_SIZE 3 /* size of shared buffer */
int buffer[BUF_SIZE];
int add = 0;
int rem = 0;
int num = 0;
pthread_mutex_t m = PTHREAD_MUTEX_INITIALIZER;
pthread_cond_t c_cons = PTHREAD_COND_INITIALIZER;
pthread_cond_t c_prod = PTHREAD_COND_INITIALIZER;
void *producer(void *param);
void *consumer(void *param);
int main(int argc, char *argv[]){
pthread_t tid1, tid2;
int i;
if (pthread_create(&tid1, NULL, producer, NULL) != 0){
fprintf(stderr, "Unable to create producer thread\n");
exit(1);
}
if (pthread_create(&tid2, NULL, consumer, NULL) != 0){
fprintf(stderr, "Unable to create consumer thread\n");
exit(1);
}
pthread_join(tid1, NULL); /* wait for producer to exit */
pthread_join(tid2, NULL); /* wait for consumer to exit */
printf("Parent quiting\n");
}
Pthread Mutexes
“to solve mutual exclusion problems among concurrent threads”
Birrell’s mechanisms: Pthreads:
-mutex
pthread_mutex_t aMutex; // mutex type
-Lock(mutex)
// explicit lock
int pthread_mutex_lock(pthread_mutex_t
*mutex);
// explicit unlock
int pthread_mutex_unlock(pthread_mutex_t
*mutex);
Other mutex operation
int pthread_mutex_init(pthread_mutex_t *mutex, const pthread_mutexattr_t *attr); // mutex attributes == specifies mutex behavior when // a mutex is shared among processes
int pthread_mutex_trylock(pthread_mutex_t *mutex);
-shared data should always be accessed through a single mutex
-mutex scope must be visible to all
-globally order lock
for all threads, lock mutexes in order
-always unlock a mutex
always unlock the correct mutex
Pthread Condition Variables
-condition
pthread_cond_t aCond; // type of cond variable
-wait
int pthread_cond_wait(pthread_cond_t *cond, pthread_mutex_t *mutex)
-signal
int pthread_cond_signal(pthread_cond_t *cond);
-broadcast
int pthread_cond_broadcast(pthread_cond_t *cond);
int pthread_cond_init(pthread_cond_t *cond, const pthread_condattr_t *attr); // attributes --e.g., if it's shared int pthread_cond_destory(pthread_cond_t *cond);
PThread Creation example2
#include#include #define NUM_THREADS 4 void *threadFunc (void *arg){ int *p = (int*)pArg; int myNum = *p; printf("Thread number %d\n", myNum); return 0; } int main(void){ int i; pthread_t tid[NUM_THREADS]; for (i=0; i < NUM_THREADS; i++){ /* create/fork threads */ pthread_create(&tid[i], NULL, threadFunc, &i); } for (i=0; i < NUM_THREADS; i++){ pthread_join(tid[i], NULL); } return 0; }
PThread Creation
#include <stdio.h>
#include <pthread.h>
void *foo (void *arg){
printf("Foobar!\n");
pthread_exit(NULL);
}
int main(void){
int i;
pthread_t tid;
pthread_attr_t attr;
pthread_attr_init(&attr);
pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED);
pthread_attr_setscope(&attr, PTHREAD_SCOPE_SYSYEM);
pthread_create(NULL, &attr, foo, NULL);
return 0;
}
Compiling PThreads
1. #include
2. compile source with -lpthread or -pthread
Intro to OS ~ ==> gcc -o main main.c -lpthread
3. check return values of common functions
#include#include #define NUM_THREADS 4 void *hello (void *arg){ printf("Hello Thread\n"); return 0; } int main(void){ int i; pthread_t tid[NUM_THREADS]; for (i=0; i < NUM_THREADS; i++){ /* create/fork threads */ pthread_create(&tid[i], NULL, hello, NULL); } for (i=0; i < NUM_THREADS; i++){ pthread_join(tid[i], NULL); } return 0; }