#if defined(__x86_64) || defined(__i386)

/*
 * Full memory fence: executes the x86 MFENCE instruction.  The "memory"
 * clobber additionally stops the compiler from moving memory accesses
 * across this call.
 */
static inline void mfence (void)
{
    __asm__ __volatile__ ("mfence" : : : "memory");
}
/*
 * Read (load) fence.  x86 has no "rfence" mnemonic; the load-fence
 * instruction is LFENCE, so that is what is emitted here.  The function
 * keeps its historical name so existing callers continue to work.  The
 * "memory" clobber also stops the compiler from moving memory accesses
 * across this call.
 */
static inline void rfence (void) {
    __asm__ volatile ("lfence":::"memory");
}
/*
 * Store fence: executes the x86 SFENCE instruction.  The "memory" clobber
 * additionally stops the compiler from moving memory accesses across this
 * call.
 */
static inline void sfence (void)
{
    __asm__ __volatile__ ("sfence" : : : "memory");
}

/* According to the Intel Architecture Software Developer's
 * Manual, Volume 3: System Programming Guide
 * (http://www.intel.com/design/pro/manuals/243192.htm), page 7-6,
 * "For the P6 family processors, locked operations serialize all
 * outstanding load and store operations (that is, wait for them to
 * complete)."
 *
 * Bradley found that the fence instructions are faster than xchgl on an
 * Opteron:
 *   mfence takes 8ns on a 1.5GHz AMD64 (maybe this is an 801)
 *   sfence takes 5ns
 *   lfence takes 3ns
 *   xchgl  takes 14ns
 */

/*
 * Atomically exchange x with *ptr and return the value *ptr held before
 * the exchange.  XCHG with a memory operand is implicitly locked on x86,
 * so no explicit LOCK prefix is needed.
 *
 *   ptr  address of the int to swap with
 *   x    value to store into *ptr
 *
 * Both operands are declared read-write ("+") because the instruction
 * reads and writes each of them.  The asm is volatile so it is never
 * dropped when the result is unused, and the "memory" clobber keeps the
 * compiler from moving other memory accesses across it.
 */
static inline int lock_xchgl(volatile int *ptr, int x)
{
    __asm__ volatile ("xchgl %0,%1"
                      : "+r" (x), "+m" (*ptr)
                      :
                      : "memory");
    return x;
}

/* The architecture conditional intentionally stays open here: the spinlock
 * code that follows relies on the x86 primitives above, and the closing
 * #else / #error / #endif of this file pairs with the opening #if. */

/* A spinlock is a one-element array of volatile int.  Because it is an
 * array type, a SPINLOCK variable passed to a function is passed by
 * address without an explicit '&'.  The lock word is 0 when unlocked;
 * spin_lock stores 1 into it to take the lock. */
typedef volatile int SPINLOCK[1];

/*
 * Initialize a spinlock: store 0 (unlocked) into the lock word, then issue
 * a full fence after the store.
 */
static inline void spin_init (SPINLOCK v)
{
    *v = 0;
    mfence();
}

/*
 * Acquire the spinlock, busy-waiting until it is available.
 *
 * The outer loop attempts an atomic exchange of 1 into the lock word; a
 * previous value of 0 means this caller now owns the lock.  When the
 * exchange fails, the inner loop spins using only reads until the word
 * looks free before retrying the exchange.  It would be better to use MCS
 * locks, but spinning on reads reduces bus traffic.
 */
static inline void spin_lock (SPINLOCK v) {
    /* v already has type volatile int *, which is what lock_xchgl takes;
     * no cast is needed (a cast to int * would discard the qualifier). */
    while (lock_xchgl(v, 1) != 0) {
        while (v[0] != 0) {
            /* Spin-wait hint; encoded as "rep; nop", so it is a plain
             * NOP on x86 processors that predate PAUSE. */
            __asm__ volatile ("pause":::"memory");
        }
    }
}
/*
 * Release the spinlock.  A store fence is issued first so that all stores
 * made before this call take place before the lock word is cleared.
 */
static inline void spin_unlock (SPINLOCK v)
{
    sfence();
    *v = 0;   /* 0 marks the lock as free */
}

#else
#error Need to define architectur-specific stuff for other machines.
#endif