This is the mail archive of the glibc-bugs@sourceware.org mailing list for the glibc project.



[Bug libc/21305] New: clock_gettime(CLOCK_MONOTONIC_RAW) can and should use rdtsc instructions instead of entering kernel through VDSO


https://sourceware.org/bugzilla/show_bug.cgi?id=21305

            Bug ID: 21305
           Summary: clock_gettime(CLOCK_MONOTONIC_RAW) can and should use
                    rdtsc instructions instead of entering kernel through
                    VDSO
           Product: glibc
           Version: 2.26
            Status: UNCONFIRMED
          Severity: enhancement
          Priority: P1
         Component: libc
          Assignee: unassigned at sourceware dot org
          Reporter: jason.vas.dias at gmail dot com
                CC: drepper.fsp at gmail dot com
  Target Milestone: ---

On a modern Intel Haswell CPU:
 processor      : 6
vendor_id       : GenuineIntel
cpu family      : 6
model           : 60
model name      : Intel(R) Core(TM) i7-4910MQ CPU @ 2.90GHz
siblings        : 8
cpu cores       : 4

with these TSC-related CPU capabilities:
( grep '^flags' /proc/cpuinfo | tr ' ' '\n' | grep tsc | sort -u )
tsc
rdtscp
constant_tsc
nonstop_tsc
tsc_deadline_timer
tsc_adjust
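
The invariant-TSC guarantee behind the constant_tsc + nonstop_tsc flags can
also be checked directly from user-space, via CPUID leaf 0x80000007, EDX
bit 8; a minimal sketch using GCC's <cpuid.h>:

<quote><code><pre>
#include <cpuid.h>
#include <stdio.h>

int main(void)
{ unsigned int eax, ebx, ecx, edx;
  /* CPUID.80000007H (Advanced Power Management): EDX[8] = Invariant TSC */
  if( __get_cpuid(0x80000007, &eax, &ebx, &ecx, &edx) && (edx & (1U << 8)) )
    puts("invariant TSC: yes");
  else
    puts("invariant TSC: no");
  return 0;
}
</pre></code></quote>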

This loop does not print any number < 600 (600ns):

   #include <stdio.h>
   #include <time.h>

   int main(void)
   {  struct timespec tsp1, tsp2;
      unsigned int i = 10;
      do
      {  clock_gettime(CLOCK_MONOTONIC_RAW, &tsp1);
         clock_gettime(CLOCK_MONOTONIC_RAW, &tsp2);
         printf("%llu\n",
          ( ((unsigned long long)tsp2.tv_sec * 1000000000ULL)
           + tsp2.tv_nsec
          )
         -( ((unsigned long long)tsp1.tv_sec * 1000000000ULL)
           + tsp1.tv_nsec
          ) );
      } while( --i );
      return 0;
   }


This is really bad, since a simple rdtsc / rdtscp instruction plus
conversion of its value, using the same formula as Linux's clocksource.c,
takes approximately 15-20ns (see the attached ttsc.tar for proof of this).

It is possible to access the VDSO's

#include <stdint.h>
typedef uint64_t      U64_t;  /* fixed-width typedefs assumed by the     */
typedef uint32_t      U32_t;  /* snippets below                          */
typedef unsigned long UL_t;
typedef unsigned long gtod_long_t; /* 'long' in the 64-bit kernel vgtod.h */

typedef struct vsyscall_gtod_data_s {
        unsigned seq;

        int vclock_mode;
        U64_t   cycle_last;
        U64_t   mask;
        U32_t   mult;
        U32_t   shift;

        /* open coded 'struct timespec' */
        U64_t           wall_time_snsec;
        gtod_long_t     wall_time_sec;
        gtod_long_t     monotonic_time_sec;
        U64_t           monotonic_time_snsec;
        gtod_long_t     wall_time_coarse_sec;
        gtod_long_t     wall_time_coarse_nsec;
        gtod_long_t     monotonic_time_coarse_sec;
        gtod_long_t     monotonic_time_coarse_nsec;

        int             tz_minuteswest;
        int             tz_dsttime;
} Linux_GTOD_t;

extern Linux_GTOD_t vvar_vsyscall_gtod_data ;


This contains the live calculated and adjusted
  'shift' and 'mult' values
used by the kernel to return the information
in a 'struct timespec *ts' in response to the
  clock_gettime(CLOCK_MONOTONIC_RAW, ts) call,
so a user-space library, e.g. glibc or a libIA64TSC.so
'rdtsc' library, could do something like:
<quote><code><pre>
#include <stdio.h> /* for the fprintf() diagnostic below */

/* _ia64_invariant_tsc_enabled, _ia64_tsc_info_initialized and
 * IA64_invariant_tsc_is_enabled() are provided by the attached ttsc.tar .
 */

__thread
U32_t _ia64_tsc_user_cpu; /* TSC Aux value identifies CPU */

static inline __attribute__((always_inline))
U64_t
IA64_tsc_now()
{ if(!(    _ia64_invariant_tsc_enabled
      ||((!_ia64_tsc_info_initialized) &&
           IA64_invariant_tsc_is_enabled(NULL,NULL))
      )
    )
  { fprintf(stderr, __FILE__":%d:(%s): must be called with invariant TSC "
                    "enabled.\n", __LINE__, __FUNCTION__);
    return 0;
  }
  U32_t tsc_hi, tsc_lo;
  register UL_t tsc;
  asm volatile
  ( "rdtscp\n\t"           /* EDX:EAX = TSC value, ECX = TSC_AUX (CPU id) */
    "mov %%edx, %0\n\t"
    "mov %%eax, %1\n\t"
    "mov %%ecx, %2\n\t"
  : "=m" (tsc_hi) ,
    "=m" (tsc_lo) ,
    "=m" (_ia64_tsc_user_cpu)
  :
  : "%rax","%rcx","%rdx"
  );
  tsc = (((UL_t)tsc_hi) << 32) | ((UL_t)tsc_lo);
  return tsc;
}

__thread
U64_t _ia64_first_tsc = 0xffffffffffffffffUL;

static inline __attribute__((always_inline))
U64_t IA64_tsc_ticks_since_start()
{ if(_ia64_first_tsc == 0xffffffffffffffffUL)
  { _ia64_first_tsc = IA64_tsc_now();
    return 0;
  }
  return (IA64_tsc_now() - _ia64_first_tsc) ;
}

#define NSEC_PER_SEC 1000000000

static inline __attribute__((always_inline))
void
ia64_tsc_calc_mult_shift
( register U32_t *mult,
  register U32_t *shift
)
{ /* Paraphrases Linux clocksource.c's clocks_calc_mult_shift() function:
   * calculates the second + nanosecond mult + shift in the same way
   * Linux does. It is a shame we cannot use the numerically accurate
   * values calculated as in IA64_s_ns_since_start_b() or
   * IA64_s_ns_since_start_a() above, but they take much longer as they
   * use long registers, and we want to be compatible with what Linux
   * returns in 'struct timespec ts' after a call to
   * clock_gettime(CLOCK_MONOTONIC_RAW, &ts).
   */
  const U32_t scale=1000U;
  register U32_t from= IA64_tsc_khz();
  register U32_t to  = NSEC_PER_SEC / scale;
  register U64_t sec = ( ~0UL / from ) / scale;  
  sec = (sec > 600) ? 600 : ((sec > 0) ? sec : 1);
  register U64_t maxsec = sec * scale;
  UL_t tmp;
  U32_t sft, sftacc=32;
  /*
   * Calculate the shift factor which is limiting the conversion
   * range:
   */
  tmp = (maxsec * from) >> 32;
  while (tmp)
  { tmp >>=1;
    sftacc--;
  }
  /*
   * Find the conversion shift/mult pair which has the best
   * accuracy and fits the maxsec conversion range:
   */
  for (sft = 32; sft > 0; sft--) 
  { tmp = ((UL_t) to) << sft;
    tmp += from / 2;
    tmp = tmp / from;
    if ((tmp >> sftacc) == 0)
      break;
  }
  *mult = tmp;
  *shift = sft;
}
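
/* Worked example, assuming IA64_tsc_khz() == 2900000 for the 2.90GHz
 * CPU above: from = 2900000, to = 1000000, maxsec = 600000, sftacc
 * works out to 23, and the loop settles on shift = 24, mult = 5785247,
 * i.e. ns = (cycles * 5785247) >> 24 = cycles * 0.344827...,
 * matching the 1/2.9 ns-per-cycle of a 2.9GHz invariant TSC.
 */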

__thread
U32_t _ia64_tsc_mult = ~0U, _ia64_tsc_shift=~0U;

static inline __attribute__((always_inline))
U64_t IA64_s_ns_since_start()
{ if( ( _ia64_tsc_mult == ~0U ) || ( _ia64_tsc_shift == ~0U ) )
    ia64_tsc_calc_mult_shift( &_ia64_tsc_mult, &_ia64_tsc_shift);
  register U64_t cycles = IA64_tsc_ticks_since_start();
  register U64_t ns = ((cycles * ((UL_t)_ia64_tsc_mult)) >> _ia64_tsc_shift);
  return( (((ns / NSEC_PER_SEC) & 0xffffffffUL) << 32)
        | ((ns % NSEC_PER_SEC) & 0x3fffffffUL) );
  /* Yes, we are purposefully ignoring durations of more than
   * 4.2 billion seconds here!
   */
}
</pre></code></quote>
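
For example, the packed result might be unpacked like this (hypothetical
usage, not part of the attached code):

<quote><code><pre>
/* Unpack the (seconds << 32) | nanoseconds value returned by
 * IA64_s_ns_since_start() into a struct timespec.
 */
U64_t s_ns = IA64_s_ns_since_start();
struct timespec ts;
ts.tv_sec  = (time_t)( s_ns >> 32 );
ts.tv_nsec = (long)  ( s_ns & 0x3fffffffUL );
</pre></code></quote>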

The last function above can be called with approximately 10-15ns latency
to return the timestamp as calculated according to the same
formula Linux uses to INITIALIZE the shift & mult values,
without entering the kernel.

It can be made more accurate if it uses the actual, UPDATED
shift & mult values stored in the vvar_vsyscall_gtod_data
in the VDSO.
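
Reading the live shift & mult consistently would mean honouring the 'seq'
counter at the start of the structure, along these lines (a sketch; it
assumes a usable pointer 'gtod' to the structure has somehow been
obtained, which is the hard part, as explained below):

<quote><code><pre>
/* Seqcount-style consistent read of the live mult/shift. An odd 'seq'
 * means an update is in progress; a changed 'seq' means we raced with
 * one; retry in either case.
 */
static inline void
gtod_read_mult_shift( volatile Linux_GTOD_t *gtod, U32_t *mult, U32_t *shift )
{ unsigned seq;
  do
  { seq = gtod->seq;
    __asm__ __volatile__("" ::: "memory");  /* compiler barrier */
    *mult  = gtod->mult;
    *shift = gtod->shift;
    __asm__ __volatile__("" ::: "memory");
  } while( (seq & 1) || (gtod->seq != seq) );
}
</pre></code></quote>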

Unfortunately, vvar_vsyscall_gtod_data is a '*ABS*' type symbol:

$ objdump -t $BLD/linux-4.10/arch/x86/entry/vdso/vdso64.so.dbg | grep vvar_vsys
ffffffffffffe080 l       *ABS*  0000000000000000  vvar_vsyscall_gtod_data

This is the offset of 'vsyscall_gtod_data' within the kernel,
so one has to subtract the value of the 'vdso_vvar_page' symbol from it,
and add the base address of the VDSO mapping in the process
to the result, to obtain the vvar_vsyscall_gtod_data address.
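
Mechanically, that arithmetic would look something like this (a sketch;
both #define values stand for the *ABS* symbol values read from the
debuginfo, and VDSO_VVAR_PAGE here is a hypothetical placeholder):

<quote><code><pre>
#include <sys/auxv.h>
#include <stdint.h>

#define VVAR_VSYSCALL_GTOD_DATA 0xffffffffffffe080UL /* from objdump above */
#define VDSO_VVAR_PAGE          0xffffffffffffe000UL /* hypothetical value */

static Linux_GTOD_t *
find_gtod_data(void)
{ /* base address of the VDSO mapping in this process, from the AUXV */
  uintptr_t vdso_base = (uintptr_t) getauxval(AT_SYSINFO_EHDR);
  return (Linux_GTOD_t*)
         ( vdso_base + (VVAR_VSYSCALL_GTOD_DATA - VDSO_VVAR_PAGE) );
}
</pre></code></quote>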

Those values can ONLY be looked up by inspecting the debuginfo data
of the VDSO.

Even parsing the VDSO from the AUXV at startup, as at:
https://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/tree/Documentation/vDSO?id=v3.8
does not help, because the absolute value is stripped out of the VDSO
linked into every executable.

I think GLIBC should be using rdtscp directly instead of calling 
clock_gettime() through the VDSO, on platforms where all these
CPU capabilities are enabled:

tsc
rdtscp
constant_tsc
nonstop_tsc

and the VDSO debug object (vdso64.so.dbg) is at a well-known location
and can be used to look up the vdso_vvar_page and vvar_vsyscall_gtod_data
values, so that conversion from the rdtsc / rdtscp result to a
(seconds, nanoseconds) value is done in an IDENTICAL WAY to
clock_gettime() in the kernel, but entirely in user-space and WITHOUT
having to lock around the vvar_vsyscall_gtod_data, which is what causes
the huge delay.
The CLOCK_MONOTONIC_RAW value is meant to be the monotonically
increasing UNADJUSTED raw timestamp counter anyway, so why should users
be forced to wait for locking so that adjustments can occur?

This imposes a HUGE overhead, and a floor on the minimum amount of
time that can be measured by user-space processes on glibc
Linux systems.

Ubuntu's libc6 DOES redirect clock_gettime(CLOCK_MONOTONIC_RAW) calls to a
function that invokes rdtscp in user-space also; why can't GLIBC
do the same?

Minimum time taken between clock_gettime() calls on my system: 600ns
Minimum time taken to invoke rdtsc & convert result          :   8ns
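
For comparison, the rdtsc figure can be reproduced with something like
the following (a sketch using GCC's __rdtscp intrinsic; divide the cycle
delta by the TSC frequency in GHz to get nanoseconds):

<quote><code><pre>
#include <x86intrin.h>
#include <stdio.h>

int main(void)
{ unsigned int aux;
  for( unsigned int i = 0; i < 10; i++ )
  { unsigned long long t1 = __rdtscp(&aux);
    unsigned long long t2 = __rdtscp(&aux);
    printf("%llu cycles\n", t2 - t1);  /* ~2.9 cycles per ns at 2.9GHz */
  }
  return 0;
}
</pre></code></quote>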

