Index: README-sx-ncx =================================================================== --- README-sx-ncx (.../trunk) (revision 0) +++ README-sx-ncx (.../branches/nec_sx_vector_ncx) (revision 531) @@ -0,0 +1,10 @@ +Changes to ncx.m4 to allow vectorisation of type conversions +on SX machines. These changes were originally made by Harvey +Davies (Unidata, CSIRO) and have been updated by me (sjl, NEC). + +Amongst other things, I have removed support for FLOAT1 and +FLOAT2 on the SX. Since these have not been supported since +SX-4 (i.e. at least two generations of machine) I think the +advantage of simplifying the patch outweighs the disadvantage +of ending support for programs using those compiler options. + Index: libsrc/ncx.m4 =================================================================== --- libsrc/ncx.m4 (.../trunk) (revision 531) +++ libsrc/ncx.m4 (.../branches/nec_sx_vector_ncx) (revision 531) @@ -1,3 +1,9 @@ +dnl +dnl sjl: this version of ncx.m4 has SX-specific optimisations as per +dnl Harvey's mods to earlier versions. However, I have removed +dnl support for FLOAT2 and attempted to tidy up the mods to +dnl reduce the complexity a bit +dnl dnl This is m4 source. dnl Process using m4 to produce 'C' language file. dnl @@ -47,7 +53,17 @@ # error "You will need to define FLT_MAX" # endif #endif +/* alias poorly named float.h macros */ +#define FLOAT_MAX FLT_MAX +#define FLOAT_MIN (-FLT_MAX) +#define DOUBLE_MAX DBL_MAX +#define DOUBLE_MIN (-DBL_MAX) +#define FLOAT_MAX_EXP FLT_MAX_EXP +#define DOUBLE_MAX_EXP DBL_MAX_EXP #include +#define UCHAR_MIN 0 +#define Min(a,b) ((a) < (b) ? (a) : (b)) +#define Max(a,b) ((a) > (b) ? 
(a) : (b)) /* * If the machine's float domain is "smaller" than the external one @@ -61,6 +77,7 @@ #endif #if _SX /* NEC SUPER UX */ +#define LOOPCNT 256 /* must be no longer than hardware vector length */ #if _INT64 #undef INT_MAX /* workaround cpp bug */ #define INT_MAX X_INT_MAX @@ -76,6 +93,9 @@ #undef LONG_MIN /* workaround cpp bug */ #define LONG_MIN -4294967295L #endif +#if !_FLOAT0 +#error "FLOAT1 and FLOAT2 not supported" +#endif #endif /* _SX */ static const char nada[X_ALIGN] = {0, 0, 0, 0}; @@ -1074,18 +1094,6 @@ } #endif -#elif _SX && _FLOAT2 -static void -get_ix_float(const void *xp, float *ip) -{ - const int ncnv = ie3_fl2(xp, ip, 4, 8, 1); -} - -static void -put_ix_float(void *xp, const float *ip) -{ - const int ncnv = fl2_ie3(ip, xp, 8, 4, 1); -} #else #error "ix_float implementation" #endif @@ -1511,18 +1519,6 @@ idp->mant = 0; } } -#elif _SX && _FLOAT2 -static void -get_ix_double(const void *xp, double *ip) -{ - const int ncnv = ie3_fl2(xp, ip, 8, 8, 1); -} - -static void -put_ix_double(void *xp, const double *ip) -{ - const int ncnv = fl2_ie3(ip, xp, 8, 8, 1); -} #else #error "ix_double implementation" #endif @@ -1839,11 +1835,17 @@ define(`Upcase',dnl `dnl translit($1, abcdefghijklmnopqrstuvwxyz, ABCDEFGHIJKLMNOPQRSTUVWXYZ)')dnl +dnl dnl dnl dnl dnl -dnl Xsizeof(Xtype) -dnl +dnl sjl: some macros used for putn/getn: +define(`Isizeof', ``SIZEOF_'Upcase($1)')dnl define(`Xsizeof', ``X_SIZEOF_'Upcase($1)')dnl +define(`Imax', `Upcase($1)`_MAX'')dnl +define(`Imin', `Upcase($1)`_MIN'')dnl +define(`Xmax', ``X_'Upcase($1)`_MAX'')dnl +define(`Xmin', ``X_'Upcase($1)`_MIN'')dnl +define(`ImaxExp', `Upcase($1)`_MAX_EXP'')dnl dnl dnl dnl dnl dnl @@ -1949,6 +1951,52 @@ int ncx_getn_$1_$2(const void **xpp, size_t nelems, $2 *tp) { +#if _SX && \ + Xsizeof($1) == Isizeof($1) + + /* basic algorithm is: + * - ensure sane alignment of input data + * - copy (conversion happens automatically) input data + * to output + * - update xpp to point at next unconverted input, 
and tp to point + * at next location for converted output + */ + long i, j, ni; + $1 tmp[LOOPCNT]; /* in case input is misaligned */ + $1 *xp; + int nrange = 0; /* number of range errors */ + int realign = 0; /* "do we need to fix input data alignment?" */ + long cxp = (long) *((char**)xpp); + + realign = (cxp & 7) % Isizeof($1); + /* sjl: manually stripmine so we can limit amount of + * vector work space reserved to LOOPCNT elements. Also + * makes vectorisation easy */ + for (j=0; j Imax($2); + } + /* update xpp and tp */ + if (realign) xp = ($1 *) *xpp; + xp += ni; + tp += ni; + *xpp = (void*)xp; + } + return nrange == 0 ? ENOERR : NC_ERANGE; + +#else /* not SX */ const char *xp = (const char *) *xpp; int status = ENOERR; @@ -1961,6 +2009,7 @@ *xpp = (const void *)xp; return status; +# endif } ')dnl dnl dnl dnl @@ -2118,6 +2167,70 @@ int ncx_putn_$1_$2(void **xpp, size_t nelems, const $2 *tp) { +#if _SX && \ + Xsizeof($1) == Isizeof($1) + + /* basic algorithm is: + * - ensure sane alignment of output data + * - copy (conversion happens automatically) input data + * to output + * - update tp to point at next unconverted input, and xpp to point + * at next location for converted output + */ + long i, j, ni; + $1 tmp[LOOPCNT]; /* in case input is misaligned */ + $1 *xp; +ifelse( $1$2, intfloat,dnl +`dnl + double d; /* special case for ncx_putn_int_float */ +')dnl + int nrange = 0; /* number of range errors */ + int realign = 0; /* "do we need to fix input data alignment?" */ + long cxp = (long) *((char**)xpp); + + realign = (cxp & 7) % Isizeof($1); + /* sjl: manually stripmine so we can limit amount of + * vector work space reserved to LOOPCNT elements. 
Also + * makes vectorisation easy */ + for (j=0; j Xmax($1); +',dnl +`dnl + /* the normal case: */ + xp[i] = ($1) Max( Xmin($1), Min(Xmax($1), ($1) tp[i])); + /* test for range errors (not always needed but do it anyway) */ + nrange += tp[i] < Xmin($1) || tp[i] > Xmax($1); +')dnl + } + /* copy workspace back if necessary */ + if (realign) { + memcpy(*xpp, tmp, ni*Xsizeof($1)); + xp = ($1 *) *xpp; + } + /* update xpp and tp */ + xp += ni; + tp += ni; + *xpp = (void*)xp; + } + return nrange == 0 ? ENOERR : NC_ERANGE; + +#else /* not SX */ + char *xp = (char *) *xpp; int status = ENOERR; @@ -2130,6 +2243,7 @@ *xpp = (void *)xp; return status; +#endif } ')dnl dnl dnl dnl @@ -2425,17 +2539,6 @@ } return ENOERR; } -#elif _SX && _FLOAT2 -int -ncx_getn_float_float(const void **xpp, size_t nelems, float *tp) -{ - const char *const xp = *xpp; - - const int ncnv = ie3_fl2(xp, tp, 4, 8, nelems); - - *xpp = xp + nelems * X_SIZEOF_FLOAT; - return (nelems == ncnv ? ENOERR : NC_ERANGE); -} #else int ncx_getn_float_float(const void **xpp, size_t nelems, float *tp) @@ -2490,17 +2593,6 @@ } return ENOERR; } -#elif _SX && _FLOAT2 -int -ncx_putn_float_float(void **xpp, size_t nelems, const float *tp) -{ - char *const xp = *xpp; - - const int ncnv = fl2_ie3(tp, xp, 8, 4, nelems); - - *xpp = xp + nelems * X_SIZEOF_FLOAT; - return (nelems == ncnv ? ENOERR : NC_ERANGE); -} #else int ncx_putn_float_float(void **xpp, size_t nelems, const float *tp) @@ -2560,17 +2652,6 @@ return ENOERR; } /* vax */ -#elif _SX && _FLOAT2 -int -ncx_getn_double_double(const void **xpp, size_t nelems, double *tp) -{ - const char *const xp = *xpp; - - const int ncnv = ie3_fl2(xp, tp, 8, 8, nelems); - - *xpp = xp + nelems * X_SIZEOF_DOUBLE; - return (nelems == ncnv ? 
ENOERR : NC_ERANGE); -} #else int ncx_getn_double_double(const void **xpp, size_t nelems, double *tp) @@ -2626,17 +2707,6 @@ return ENOERR; } /* vax */ -#elif _SX && _FLOAT2 -int -ncx_putn_double_double(void **xpp, size_t nelems, const double *tp) -{ - char *const xp = *xpp; - - const int ncnv = fl2_ie3(tp, xp, 8, 8, nelems); - - *xpp = xp + nelems * X_SIZEOF_DOUBLE; - return (nelems == ncnv ? ENOERR : NC_ERANGE); -} #else int ncx_putn_double_double(void **xpp, size_t nelems, const double *tp)