DeepSeek-V3-split/inference/.venv/lib/python3.10/site-packages/nvidia/cuda_runtime/include/cuComplex.h
/*
 * Copyright 1993-2012 NVIDIA Corporation. All rights reserved.
 *
 * NOTICE TO LICENSEE:
 *
 * This source code and/or documentation ("Licensed Deliverables") are
 * subject to NVIDIA intellectual property rights under U.S. and
 * international Copyright laws.
 *
 * These Licensed Deliverables contained herein is PROPRIETARY and
 * CONFIDENTIAL to NVIDIA and is being provided under the terms and
 * conditions of a form of NVIDIA software license agreement by and
 * between NVIDIA and Licensee ("License Agreement") or electronically
 * accepted by Licensee. Notwithstanding any terms or conditions to
 * the contrary in the License Agreement, reproduction or disclosure
 * of the Licensed Deliverables to any third party without the express
 * written consent of NVIDIA is prohibited.
 *
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, NVIDIA MAKES NO REPRESENTATION ABOUT THE
 * SUITABILITY OF THESE LICENSED DELIVERABLES FOR ANY PURPOSE. IT IS
 * PROVIDED "AS IS" WITHOUT EXPRESS OR IMPLIED WARRANTY OF ANY KIND.
 * NVIDIA DISCLAIMS ALL WARRANTIES WITH REGARD TO THESE LICENSED
 * DELIVERABLES, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY,
 * NONINFRINGEMENT, AND FITNESS FOR A PARTICULAR PURPOSE.
 * NOTWITHSTANDING ANY TERMS OR CONDITIONS TO THE CONTRARY IN THE
 * LICENSE AGREEMENT, IN NO EVENT SHALL NVIDIA BE LIABLE FOR ANY
 * SPECIAL, INDIRECT, INCIDENTAL, OR CONSEQUENTIAL DAMAGES, OR ANY
 * DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS,
 * WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
 * ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
 * OF THESE LICENSED DELIVERABLES.
 *
 * U.S. Government End Users. These Licensed Deliverables are a
 * "commercial item" as that term is defined at 48 C.F.R. 2.101 (OCT
 * 1995), consisting of "commercial computer software" and "commercial
 * computer software documentation" as such terms are used in 48
 * C.F.R. 12.212 (SEPT 1995) and is provided to the U.S. Government
 * only as a commercial end item. Consistent with 48 C.F.R.12.212 and
 * 48 C.F.R. 227.7202-1 through 227.7202-4 (JUNE 1995), all
 * U.S. Government End Users acquire the Licensed Deliverables with
 * only those rights set forth herein.
 *
 * Any use of the Licensed Deliverables in individual and commercial
 * software must include, in the user documentation and internal
 * comments to the code, the above Disclaimer and U.S. Government End
 * Users Notice.
 */
#if !defined(CU_COMPLEX_H_)
#define CU_COMPLEX_H_

/* When including this C header file in C++ code, extern "C" is required.
 * But the standard QNX headers already have an ifdef'd extern in them when
 * compiling C++ code, and extern "C" cannot be nested.
 * Hence keep the standard header include out of the extern "C" block.
 */
#include <math.h>       /* fabsf, fabs, sqrtf, sqrt */

#if defined(__cplusplus)
extern "C" {
#endif /* __cplusplus */

#include "vector_types.h"   /* float2, double2 */
typedef float2 cuFloatComplex;

__host__ __device__ static __inline__ float cuCrealf (cuFloatComplex x)
{
    return x.x;
}

__host__ __device__ static __inline__ float cuCimagf (cuFloatComplex x)
{
    return x.y;
}

__host__ __device__ static __inline__ cuFloatComplex make_cuFloatComplex
                                                             (float r, float i)
{
    cuFloatComplex res;
    res.x = r;
    res.y = i;
    return res;
}

__host__ __device__ static __inline__ cuFloatComplex cuConjf (cuFloatComplex x)
{
    return make_cuFloatComplex (cuCrealf(x), -cuCimagf(x));
}

__host__ __device__ static __inline__ cuFloatComplex cuCaddf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    return make_cuFloatComplex (cuCrealf(x) + cuCrealf(y),
                                cuCimagf(x) + cuCimagf(y));
}

__host__ __device__ static __inline__ cuFloatComplex cuCsubf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    return make_cuFloatComplex (cuCrealf(x) - cuCrealf(y),
                                cuCimagf(x) - cuCimagf(y));
}
/* This implementation could suffer from intermediate overflow even though
 * the final result would be in range. However, various implementations do
 * not guard against this (presumably to avoid losing performance), so we
 * don't do it either to stay competitive.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCmulf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    cuFloatComplex prod;
    prod = make_cuFloatComplex ((cuCrealf(x) * cuCrealf(y)) -
                                (cuCimagf(x) * cuCimagf(y)),
                                (cuCrealf(x) * cuCimagf(y)) +
                                (cuCimagf(x) * cuCrealf(y)));
    return prod;
}
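
/* A small illustration of the caveat above (a sketch, not part of the API):
 * when float expressions are evaluated in single precision, the unguarded
 * product can turn a representable component into NaN via intermediate
 * overflow. For example:
 *
 *     cuFloatComplex z = make_cuFloatComplex(2.0e19f, 2.0e19f);
 *     cuFloatComplex p = cuCmulf(z, cuConjf(z));   // exact: 8.0e38 + 0.0i
 *     // cuCimagf(p) is NaN rather than 0.0f: the two cross products
 *     // overflow to -Inf and +Inf before they cancel.
 */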
/* This implementation guards against intermediate underflow and overflow
 * by scaling. Such guarded implementations are usually the default for
 * complex library implementations, with some also offering an unguarded,
 * faster version.
 */
__host__ __device__ static __inline__ cuFloatComplex cuCdivf (cuFloatComplex x,
                                                              cuFloatComplex y)
{
    cuFloatComplex quot;
    float s = fabsf(cuCrealf(y)) + fabsf(cuCimagf(y));
    float oos = 1.0f / s;
    float ars = cuCrealf(x) * oos;
    float ais = cuCimagf(x) * oos;
    float brs = cuCrealf(y) * oos;
    float bis = cuCimagf(y) * oos;
    s = (brs * brs) + (bis * bis);
    oos = 1.0f / s;
    quot = make_cuFloatComplex (((ars * brs) + (ais * bis)) * oos,
                                ((ais * brs) - (ars * bis)) * oos);
    return quot;
}
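
/* Why the scaling matters (illustrative sketch): a naive quotient computes
 * the denominator as c*c + d*d, which overflows float once |y| exceeds
 * roughly 1.8e19 (the square root of FLT_MAX), even when the quotient
 * itself is well within range. Dividing everything by |c| + |d| first keeps
 * the intermediates near 1. For example:
 *
 *     cuFloatComplex x = make_cuFloatComplex(2.0e30f, 0.0f);
 *     cuFloatComplex y = make_cuFloatComplex(1.0e30f, 1.0e30f);
 *     cuFloatComplex q = cuCdivf(x, y);   // (1.0f, -1.0f)
 *     // unscaled: c*c + d*d = 2e60 -> Inf and the result degenerates.
 */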
/*
 * We would like to call hypotf(), but it's not available on all platforms.
 * This discrete implementation guards against intermediate underflow and
 * overflow by scaling. Otherwise we would lose half the exponent range.
 * There are various ways of doing guarded computation. For now we choose the
 * simplest and fastest solution; however, this may suffer from inaccuracies
 * if sqrt and division are not IEEE compliant.
 */
__host__ __device__ static __inline__ float cuCabsf (cuFloatComplex x)
{
    float a = cuCrealf(x);
    float b = cuCimagf(x);
    float v, w, t;
    a = fabsf(a);
    b = fabsf(b);
    if (a > b) {
        v = a;
        w = b;
    } else {
        v = b;
        w = a;
    }
    t = w / v;
    t = 1.0f + t * t;
    t = v * sqrtf(t);
    if ((v == 0.0f) || (v > 3.402823466e38f) || (w > 3.402823466e38f)) {
        t = v + w;
    }
    return t;
}
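
/* A brief illustration (sketch): the rescaled form v * sqrtf(1 + (w/v)^2)
 * stays accurate where the naive sqrtf(a*a + b*b) would overflow:
 *
 *     cuFloatComplex z = make_cuFloatComplex(3.0e20f, 4.0e20f);
 *     float m = cuCabsf(z);                                        // ~5.0e20f
 *     float naive = sqrtf(3.0e20f * 3.0e20f + 4.0e20f * 4.0e20f);  // Inf
 */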
/* Double precision */
typedef double2 cuDoubleComplex;

__host__ __device__ static __inline__ double cuCreal (cuDoubleComplex x)
{
    return x.x;
}

__host__ __device__ static __inline__ double cuCimag (cuDoubleComplex x)
{
    return x.y;
}

__host__ __device__ static __inline__ cuDoubleComplex make_cuDoubleComplex
                                                           (double r, double i)
{
    cuDoubleComplex res;
    res.x = r;
    res.y = i;
    return res;
}

__host__ __device__ static __inline__ cuDoubleComplex cuConj(cuDoubleComplex x)
{
    return make_cuDoubleComplex (cuCreal(x), -cuCimag(x));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCadd(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    return make_cuDoubleComplex (cuCreal(x) + cuCreal(y),
                                 cuCimag(x) + cuCimag(y));
}

__host__ __device__ static __inline__ cuDoubleComplex cuCsub(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    return make_cuDoubleComplex (cuCreal(x) - cuCreal(y),
                                 cuCimag(x) - cuCimag(y));
}
/* This implementation could suffer from intermediate overflow even though
 * the final result would be in range. However, various implementations do
 * not guard against this (presumably to avoid losing performance), so we
 * don't do it either to stay competitive.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCmul(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    cuDoubleComplex prod;
    prod = make_cuDoubleComplex ((cuCreal(x) * cuCreal(y)) -
                                 (cuCimag(x) * cuCimag(y)),
                                 (cuCreal(x) * cuCimag(y)) +
                                 (cuCimag(x) * cuCreal(y)));
    return prod;
}

/* This implementation guards against intermediate underflow and overflow
 * by scaling. Such guarded implementations are usually the default for
 * complex library implementations, with some also offering an unguarded,
 * faster version.
 */
__host__ __device__ static __inline__ cuDoubleComplex cuCdiv(cuDoubleComplex x,
                                                             cuDoubleComplex y)
{
    cuDoubleComplex quot;
    double s = (fabs(cuCreal(y))) + (fabs(cuCimag(y)));
    double oos = 1.0 / s;
    double ars = cuCreal(x) * oos;
    double ais = cuCimag(x) * oos;
    double brs = cuCreal(y) * oos;
    double bis = cuCimag(y) * oos;
    s = (brs * brs) + (bis * bis);
    oos = 1.0 / s;
    quot = make_cuDoubleComplex (((ars * brs) + (ais * bis)) * oos,
                                 ((ais * brs) - (ars * bis)) * oos);
    return quot;
}
/* This implementation guards against intermediate underflow and overflow
 * by scaling. Otherwise we would lose half the exponent range. There are
 * various ways of doing guarded computation. For now we choose the simplest
 * and fastest solution; however, this may suffer from inaccuracies if sqrt
 * and division are not IEEE compliant.
 */
__host__ __device__ static __inline__ double cuCabs (cuDoubleComplex x)
{
    double a = cuCreal(x);
    double b = cuCimag(x);
    double v, w, t;
    a = fabs(a);
    b = fabs(b);
    if (a > b) {
        v = a;
        w = b;
    } else {
        v = b;
        w = a;
    }
    t = w / v;
    t = 1.0 + t * t;
    t = v * sqrt(t);
    if ((v == 0.0) ||
        (v > 1.79769313486231570e+308) || (w > 1.79769313486231570e+308)) {
        t = v + w;
    }
    return t;
}
#if defined(__cplusplus)
}
#endif /* __cplusplus */
/* aliases */
typedef cuFloatComplex cuComplex;
__host__ __device__ static __inline__ cuComplex make_cuComplex (float x,
                                                                float y)
{
    return make_cuFloatComplex (x, y);
}

/* float-to-double promotion */
__host__ __device__ static __inline__ cuDoubleComplex cuComplexFloatToDouble
                                                            (cuFloatComplex c)
{
    return make_cuDoubleComplex ((double)cuCrealf(c), (double)cuCimagf(c));
}

__host__ __device__ static __inline__ cuFloatComplex cuComplexDoubleToFloat
                                                           (cuDoubleComplex c)
{
    return make_cuFloatComplex ((float)cuCreal(c), (float)cuCimag(c));
}
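
/* One place the conversion helpers come in handy (illustrative sketch; a, n
 * and total are placeholder names): accumulate single-precision terms in a
 * double-precision running sum and only demote at the end:
 *
 *     cuDoubleComplex acc = make_cuDoubleComplex(0.0, 0.0);
 *     for (int i = 0; i < n; ++i)
 *         acc = cuCadd(acc, cuComplexFloatToDouble(a[i]));   // a: cuComplex*
 *     cuComplex total = cuComplexDoubleToFloat(acc);
 */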
__host__ __device__ static __inline__ cuComplex cuCfmaf( cuComplex x, cuComplex y, cuComplex d)
{
    float real_res;
    float imag_res;
    real_res = (cuCrealf(x) * cuCrealf(y)) + cuCrealf(d);
    imag_res = (cuCrealf(x) * cuCimagf(y)) + cuCimagf(d);
    real_res = -(cuCimagf(x) * cuCimagf(y)) + real_res;
    imag_res = (cuCimagf(x) * cuCrealf(y)) + imag_res;
    return make_cuComplex(real_res, imag_res);
}
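
/* The four statements above are arranged so that each one is a single
 * multiply-add, which the compiler can contract into fmaf instructions.
 * A typical use (illustrative sketch; x, y, n, acc are placeholder names)
 * is a complex dot-product style accumulation:
 *
 *     cuComplex acc = make_cuComplex(0.0f, 0.0f);
 *     for (int i = 0; i < n; ++i)
 *         acc = cuCfmaf(x[i], y[i], acc);   // acc += x[i] * y[i]
 */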
__host__ __device__ static __inline__ cuDoubleComplex cuCfma( cuDoubleComplex x, cuDoubleComplex y, cuDoubleComplex d)
{
    double real_res;
    double imag_res;
    real_res = (cuCreal(x) * cuCreal(y)) + cuCreal(d);
    imag_res = (cuCreal(x) * cuCimag(y)) + cuCimag(d);
    real_res = -(cuCimag(x) * cuCimag(y)) + real_res;
    imag_res = (cuCimag(x) * cuCreal(y)) + imag_res;
    return make_cuDoubleComplex(real_res, imag_res);
}

#endif /* !defined(CU_COMPLEX_H_) */