[149125] trunk/dports/science/volk
michaelld at macports.org
michaelld at macports.org
Tue May 31 17:36:06 PDT 2016
Revision: 149125
https://trac.macports.org/changeset/149125
Author: michaelld at macports.org
Date: 2016-05-31 17:36:06 -0700 (Tue, 31 May 2016)
Log Message:
-----------
volk: add a temporary patchfile to correct API for volk_32f_index_max_16u to be the same as that provided by volk-devel, and thus correct with respect to usage by other ports.
Modified Paths:
--------------
trunk/dports/science/volk/Portfile
Added Paths:
-----------
trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff
Modified: trunk/dports/science/volk/Portfile
===================================================================
--- trunk/dports/science/volk/Portfile 2016-05-31 22:17:24 UTC (rev 149124)
+++ trunk/dports/science/volk/Portfile 2016-06-01 00:36:06 UTC (rev 149125)
@@ -30,6 +30,12 @@
provides the release version, which is typically updated every month or so.
conflicts volk-devel
+ # temporary patchfile to correct API for volk_32f_index_max_16u to
+ # be the same as that provided by volk-devel, and thus correct
+ # with respect to usage by other ports.
+
+ patchfiles-append patch-update_1.2.2_to_current.diff
+
}
subport volk-devel {
Added: trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff
===================================================================
--- trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff (rev 0)
+++ trunk/dports/science/volk/files/patch-update_1.2.2_to_current.diff 2016-06-01 00:36:06 UTC (rev 149125)
@@ -0,0 +1,1295 @@
+--- CMakeLists.txt.orig
++++ CMakeLists.txt
+@@ -215,6 +215,11 @@ endif()
+ ########################################################################
+
+ configure_file(
++ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake.in
++ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
++@ONLY)
++
++configure_file(
+ ${CMAKE_SOURCE_DIR}/cmake/Modules/VolkConfigVersion.cmake.in
+ ${CMAKE_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+ @ONLY)
+@@ -230,7 +235,7 @@ endif(NOT CMAKE_MODULES_DIR)
+
+ install(
+ FILES
+- ${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules/VolkConfig.cmake
++ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfig.cmake
+ ${CMAKE_CURRENT_BINARY_DIR}/cmake/Modules/VolkConfigVersion.cmake
+ DESTINATION ${CMAKE_MODULES_DIR}/volk
+ COMPONENT "volk_devel"
+--- apps/volk-config-info.cc.orig
++++ apps/volk-config-info.cc
+@@ -1,6 +1,6 @@
+ /* -*- c++ -*- */
+ /*
+- * Copyright 2013 Free Software Foundation, Inc.
++ * Copyright 2013, 2016 Free Software Foundation, Inc.
+ *
+ * This file is part of GNU Radio
+ *
+@@ -45,6 +45,8 @@ main(int argc, char **argv)
+ ("all-machines", "print VOLK machines built into library")
+ ("avail-machines", "print VOLK machines the current platform can use")
+ ("machine", "print the VOLK machine that will be used")
++ ("alignment", "print the alignment that will be used")
++ ("malloc", "print malloc implementation that will be used")
+ ("version,v", "print VOLK version")
+ ;
+
+@@ -88,5 +90,22 @@ main(int argc, char **argv)
+ std::cout << volk_get_machine() << std::endl;
+ }
+
++ if(vm.count("alignment")) {
++ std::cout << "Alignment in bytes: " << volk_get_alignment() << std::endl;
++ }
++
++ // You don't want to change the volk_malloc code, so just copy the if/else
++ // structure from there and give an explanation for the implementations
++ if(vm.count("malloc")) {
++ std::cout << "Used malloc implementation: ";
++#if _POSIX_C_SOURCE >= 200112L || _XOPEN_SOURCE >= 600 || HAVE_POSIX_MEMALIGN
++ std::cout << "posix_memalign" << std::endl;
++#elif _MSC_VER >= 1400
++ std::cout << "aligned_malloc" << std::endl;
++#else
++ std::cout << "No standard handler available, using own implementation." << std::endl;
++#endif
++ }
++
+ return 0;
+ }
+--- cmake/Modules/VolkConfig.cmake
++++ /dev/null
+@@ -1,26 +0,0 @@
+-INCLUDE(FindPkgConfig)
+-PKG_CHECK_MODULES(PC_VOLK volk)
+-
+-FIND_PATH(
+- VOLK_INCLUDE_DIRS
+- NAMES volk/volk.h
+- HINTS $ENV{VOLK_DIR}/include
+- ${PC_VOLK_INCLUDEDIR}
+- PATHS /usr/local/include
+- /usr/include
+-)
+-
+-FIND_LIBRARY(
+- VOLK_LIBRARIES
+- NAMES volk
+- HINTS $ENV{VOLK_DIR}/lib
+- ${PC_VOLK_LIBDIR}
+- PATHS /usr/local/lib
+- /usr/local/lib64
+- /usr/lib
+- /usr/lib64
+-)
+-
+-INCLUDE(FindPackageHandleStandardArgs)
+-FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+-MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+--- /dev/null
++++ cmake/Modules/VolkConfig.cmake.in
+@@ -0,0 +1,28 @@
++INCLUDE(FindPkgConfig)
++PKG_CHECK_MODULES(PC_VOLK volk)
++
++FIND_PATH(
++ VOLK_INCLUDE_DIRS
++ NAMES volk/volk.h
++ HINTS $ENV{VOLK_DIR}/include
++ ${PC_VOLK_INCLUDEDIR}
++ PATHS /usr/local/include
++ /usr/include
++ "@CMAKE_INSTALL_PREFIX@/include"
++)
++
++FIND_LIBRARY(
++ VOLK_LIBRARIES
++ NAMES volk
++ HINTS $ENV{VOLK_DIR}/lib
++ ${PC_VOLK_LIBDIR}
++ PATHS /usr/local/lib
++ /usr/local/lib64
++ /usr/lib
++ /usr/lib64
++ "@CMAKE_INSTALL_PREFIX@/lib"
++)
++
++INCLUDE(FindPackageHandleStandardArgs)
++FIND_PACKAGE_HANDLE_STANDARD_ARGS(VOLK DEFAULT_MSG VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
++MARK_AS_ADVANCED(VOLK_LIBRARIES VOLK_INCLUDE_DIRS)
+--- docs/kernels.dox.orig
++++ docs/kernels.dox
+@@ -47,6 +47,7 @@
+ \li \subpage volk_32fc_deinterleave_real_32f
+ \li \subpage volk_32fc_deinterleave_real_64f
+ \li \subpage volk_32fc_index_max_16u
++\li \subpage volk_32fc_index_max_32u
+ \li \subpage volk_32fc_magnitude_32f
+ \li \subpage volk_32fc_magnitude_squared_32f
+ \li \subpage volk_32f_cos_32f
+@@ -61,6 +62,7 @@
+ \li \subpage volk_32fc_x2_square_dist_32f
+ \li \subpage volk_32f_expfast_32f
+ \li \subpage volk_32f_index_max_16u
++\li \subpage volk_32f_index_max_32u
+ \li \subpage volk_32f_invsqrt_32f
+ \li \subpage volk_32f_log2_32f
+ \li \subpage volk_32f_s32f_calc_spectral_noise_floor_32f
+--- kernels/volk/volk_32f_index_max_16u.h.orig
++++ kernels/volk/volk_32f_index_max_16u.h
+@@ -25,11 +25,18 @@
+ *
+ * \b Overview
+ *
+- * Returns Argmax_i x[i]. Finds and returns the index which contains the maximum value in the given vector.
++ * Returns Argmax_i x[i]. Finds and returns the index which contains
++ * the maximum value in the given vector.
++ *
++ * Note that num_points is a uint32_t, but the return value is
++ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything outside of this boundary. The kernel
++ * will check the length of num_points and cap it to this max value,
++ * anyways.
+ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points)
++ * void volk_32f_index_max_16u(uint16_t* target, const float* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+@@ -42,11 +49,11 @@
+ * \b Example
+ * \code
+ * int N = 10;
+- * unsigned int alignment = volk_get_alignment();
++ * uint32_t alignment = volk_get_alignment();
+ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
+- * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
++ * uint16_t* out = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+- * for(unsigned int ii = 0; ii < N; ++ii){
++ * for(uint32_t ii = 0; ii < N; ++ii){
+ * float x = (float)ii;
+ * // a parabola with a maximum at x=4
+ * in[ii] = -(x-4) * (x-4) + 5;
+@@ -67,64 +74,66 @@
+ #include <volk/volk_common.h>
+ #include <volk/volk_common.h>
+ #include <inttypes.h>
++#include <limits.h>
+ #include <stdio.h>
+
+ #ifdef LV_HAVE_SSE4_1
+-#include<smmintrin.h>
++#include <smmintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigned int num_points)
++volk_32f_index_max_16u_a_sse4_1(uint16_t* target, const float* src0,
++ uint32_t num_points)
+ {
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- for(;number < quarterPoints; number++){
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ for(;number < quarterPoints; number++){
+
+- compareResults = _mm_cmpgt_ps(maxValues, currentValues);
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
+- maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
+- }
++ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
++ }
+
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max){
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
+ }
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
+- }
++ number = quarterPoints * 4;
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
+ }
+- target[0] = (unsigned int)index;
+ }
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_SSE4_1*/
+@@ -132,62 +141,63 @@ volk_32f_index_max_16u_a_sse4_1(unsigned int* target, const float* src0, unsigne
+
+ #ifdef LV_HAVE_SSE
+
+-#include<xmmintrin.h>
++#include <xmmintrin.h>
+
+ static inline void
+-volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned int num_points)
++volk_32f_index_max_16u_a_sse(uint16_t* target, const float* src0,
++ uint32_t num_points)
+ {
+- if(num_points > 0){
+- unsigned int number = 0;
+- const unsigned int quarterPoints = num_points / 4;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
+
+- float* inputPtr = (float*)src0;
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
+
+- __m128 indexIncrementValues = _mm_set1_ps(4);
+- __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++ float* inputPtr = (float*)src0;
+
+- float max = src0[0];
+- float index = 0;
+- __m128 maxValues = _mm_set1_ps(max);
+- __m128 maxValuesIndex = _mm_setzero_ps();
+- __m128 compareResults;
+- __m128 currentValues;
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
+
+- __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
+- __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
+
+- for(;number < quarterPoints; number++){
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
+
+- currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
+- currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++ for(;number < quarterPoints; number++){
+
+- compareResults = _mm_cmpgt_ps(maxValues, currentValues);
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
+
+- maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
++ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
+
+- maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
+- }
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
++
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
++ }
+
+- // Calculate the largest value from the remaining 4 points
+- _mm_store_ps(maxValuesBuffer, maxValues);
+- _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
+
+- for(number = 0; number < 4; number++){
+- if(maxValuesBuffer[number] > max){
+- index = maxIndexesBuffer[number];
+- max = maxValuesBuffer[number];
+- }
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max){
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
+ }
++ }
+
+- number = quarterPoints * 4;
+- for(;number < num_points; number++){
+- if(src0[number] > max){
+- index = number;
+- max = src0[number];
+- }
++ number = quarterPoints * 4;
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
+ }
+- target[0] = (unsigned int)index;
+ }
++ target[0] = (uint16_t)index;
+ }
+
+ #endif /*LV_HAVE_SSE*/
+@@ -196,22 +206,23 @@ volk_32f_index_max_16u_a_sse(unsigned int* target, const float* src0, unsigned i
+ #ifdef LV_HAVE_GENERIC
+
+ static inline void
+-volk_32f_index_max_16u_generic(unsigned int* target, const float* src0, unsigned int num_points)
++volk_32f_index_max_16u_generic(uint16_t* target, const float* src0,
++ uint32_t num_points)
+ {
+- if(num_points > 0){
+- float max = src0[0];
+- unsigned int index = 0;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++
++ float max = src0[0];
++ uint16_t index = 0;
+
+- unsigned int i = 1;
++ uint32_t i = 1;
+
+- for(; i < num_points; ++i) {
+- if(src0[i] > max){
+- index = i;
+- max = src0[i];
+- }
++ for(; i < num_points; ++i) {
++ if(src0[i] > max) {
++ index = i;
++ max = src0[i];
+ }
+- target[0] = index;
+ }
++ target[0] = index;
+ }
+
+ #endif /*LV_HAVE_GENERIC*/
+--- /dev/null
++++ kernels/volk/volk_32f_index_max_32u.h
+@@ -0,0 +1,220 @@
++/* -*- c++ -*- */
++/*
++ * Copyright 2016 Free Software Foundation, Inc.
++ *
++ * This file is part of GNU Radio
++ *
++ * GNU Radio is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 3, or (at your option)
++ * any later version.
++ *
++ * GNU Radio is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNU Radio; see the file COPYING. If not, write to
++ * the Free Software Foundation, Inc., 51 Franklin Street,
++ * Boston, MA 02110-1301, USA.
++ */
++
++/*!
++ * \page volk_32f_index_max_32u
++ *
++ * \b Overview
++ *
++ * Returns Argmax_i x[i]. Finds and returns the index which contains the maximum value in the given vector.
++ *
++ * <b>Dispatcher Prototype</b>
++ * \code
++ * void volk_32f_index_max_32u(uint32_t* target, const float* src0, uint32_t num_points)
++ * \endcode
++ *
++ * \b Inputs
++ * \li src0: The input vector of floats.
++ * \li num_points: The number of data points.
++ *
++ * \b Outputs
++ * \li target: The index of the maximum value in the input buffer.
++ *
++ * \b Example
++ * \code
++ * int N = 10;
++ * uint32_t alignment = volk_get_alignment();
++ * float* in = (float*)volk_malloc(sizeof(float)*N, alignment);
++ * uint32_t* out = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
++ *
++ * for(uint32_t ii = 0; ii < N; ++ii){
++ * float x = (float)ii;
++ * // a parabola with a maximum at x=4
++ * in[ii] = -(x-4) * (x-4) + 5;
++ * }
++ *
++ * volk_32f_index_max_32u(out, in, N);
++ *
++ * printf("maximum is %1.2f at index %u\n", in[*out], *out);
++ *
++ * volk_free(in);
++ * volk_free(out);
++ * \endcode
++ */
++
++#ifndef INCLUDED_volk_32f_index_max_32u_a_H
++#define INCLUDED_volk_32f_index_max_32u_a_H
++
++#include <volk/volk_common.h>
++#include <volk/volk_common.h>
++#include <inttypes.h>
++#include <stdio.h>
++
++#ifdef LV_HAVE_SSE4_1
++#include<smmintrin.h>
++
++static inline void
++volk_32f_index_max_32u_a_sse4_1(uint32_t* target, const float* src0, uint32_t num_points)
++{
++ if(num_points > 0){
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for(;number < quarterPoints; number++){
++
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++
++ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
++
++ maxValuesIndex = _mm_blendv_ps(currentIndexes, maxValuesIndex, compareResults);
++ maxValues = _mm_blendv_ps(currentValues, maxValues, compareResults);
++ }
++
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max){
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4;
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
++ }
++}
++
++#endif /*LV_HAVE_SSE4_1*/
++
++
++#ifdef LV_HAVE_SSE
++
++#include<xmmintrin.h>
++
++static inline void
++volk_32f_index_max_32u_a_sse(uint32_t* target, const float* src0, uint32_t num_points)
++{
++ if(num_points > 0){
++ uint32_t number = 0;
++ const uint32_t quarterPoints = num_points / 4;
++
++ float* inputPtr = (float*)src0;
++
++ __m128 indexIncrementValues = _mm_set1_ps(4);
++ __m128 currentIndexes = _mm_set_ps(-1,-2,-3,-4);
++
++ float max = src0[0];
++ float index = 0;
++ __m128 maxValues = _mm_set1_ps(max);
++ __m128 maxValuesIndex = _mm_setzero_ps();
++ __m128 compareResults;
++ __m128 currentValues;
++
++ __VOLK_ATTR_ALIGNED(16) float maxValuesBuffer[4];
++ __VOLK_ATTR_ALIGNED(16) float maxIndexesBuffer[4];
++
++ for(;number < quarterPoints; number++){
++
++ currentValues = _mm_load_ps(inputPtr); inputPtr += 4;
++ currentIndexes = _mm_add_ps(currentIndexes, indexIncrementValues);
++
++ compareResults = _mm_cmpgt_ps(maxValues, currentValues);
++
++ maxValuesIndex = _mm_or_ps(_mm_and_ps(compareResults, maxValuesIndex) , _mm_andnot_ps(compareResults, currentIndexes));
++
++ maxValues = _mm_or_ps(_mm_and_ps(compareResults, maxValues) , _mm_andnot_ps(compareResults, currentValues));
++ }
++
++ // Calculate the largest value from the remaining 4 points
++ _mm_store_ps(maxValuesBuffer, maxValues);
++ _mm_store_ps(maxIndexesBuffer, maxValuesIndex);
++
++ for(number = 0; number < 4; number++){
++ if(maxValuesBuffer[number] > max){
++ index = maxIndexesBuffer[number];
++ max = maxValuesBuffer[number];
++ }
++ }
++
++ number = quarterPoints * 4;
++ for(;number < num_points; number++){
++ if(src0[number] > max){
++ index = number;
++ max = src0[number];
++ }
++ }
++ target[0] = (uint32_t)index;
++ }
++}
++
++#endif /*LV_HAVE_SSE*/
++
++
++#ifdef LV_HAVE_GENERIC
++
++static inline void
++volk_32f_index_max_32u_generic(uint32_t* target, const float* src0, uint32_t num_points)
++{
++ if(num_points > 0){
++ float max = src0[0];
++ uint32_t index = 0;
++
++ uint32_t i = 1;
++
++ for(; i < num_points; ++i) {
++ if(src0[i] > max){
++ index = i;
++ max = src0[i];
++ }
++ }
++ target[0] = index;
++ }
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++#endif /*INCLUDED_volk_32f_index_max_32u_a_H*/
+--- kernels/volk/volk_32fc_index_max_16u.h.orig
++++ kernels/volk/volk_32fc_index_max_16u.h
+@@ -28,9 +28,15 @@
+ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
+ * maximum magnitude for complex points in the given vector.
+ *
++ * Note that num_points is a uint32_t, but the return value is
++ * uint16_t. Providing a vector larger than the max of a uint16_t
+ * (65535) would miss anything outside of this boundary. The kernel
++ * will check the length of num_points and cap it to this max value,
++ * anyways.
++ *
+ * <b>Dispatcher Prototype</b>
+ * \code
+- * void volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0, unsigned int num_points)
++ * void volk_32fc_index_max_16u(uint16_t* target, lv_32fc_t* src0, uint32_t num_points)
+ * \endcode
+ *
+ * \b Inputs
+@@ -45,11 +51,11 @@
+ * the unit circle.
+ * \code
+ * int N = 10;
+- * unsigned int alignment = volk_get_alignment();
++ * uint32_t alignment = volk_get_alignment();
+ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
+- * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
++ * uint16_t* max = (uint16_t*)volk_malloc(sizeof(uint16_t), alignment);
+ *
+- * for(unsigned int ii = 0; ii < N/2; ++ii){
++ * for(uint32_t ii = 0; ii < N/2; ++ii){
+ * float real = 2.f * ((float)ii / (float)N) - 1.f;
+ * float imag = std::sqrt(1.f - real * real);
+ * in[ii] = lv_cmake(real, imag);
+@@ -71,19 +77,24 @@
+ #define INCLUDED_volk_32fc_index_max_16u_a_H
+
+ #include <volk/volk_common.h>
+-#include<inttypes.h>
+-#include<stdio.h>
+-#include<volk/volk_complex.h>
++#include <inttypes.h>
++#include <stdio.h>
++#include <limits.h>
++#include <volk/volk_complex.h>
+
+ #ifdef LV_HAVE_SSE3
+-#include<xmmintrin.h>
+-#include<pmmintrin.h>
++#include <xmmintrin.h>
++#include <pmmintrin.h>
+
+ static inline void
+-volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
+- unsigned int num_points)
++volk_32fc_index_max_16u_a_sse3(uint16_t* target, lv_32fc_t* src0,
++ uint32_t num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++ // Branchless version, if we think it'll make a difference
++ //num_points = USHRT_MAX ^ ((num_points ^ USHRT_MAX) & -(num_points < USHRT_MAX));
++
++ const uint32_t num_bytes = num_points*8;
+
+ union bit128 holderf;
+ union bit128 holderi;
+@@ -206,11 +217,11 @@ volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
+ /*
+ float placeholder = 0.0;
+ uint32_t temp0, temp1;
+- unsigned int g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
+- unsigned int l0 = g0 ^ 1;
++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++ uint32_t l0 = g0 ^ 1;
+
+- unsigned int g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
+- unsigned int l1 = g1 ^ 1;
++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++ uint32_t l1 = g1 ^ 1;
+
+ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
+ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
+@@ -227,16 +238,18 @@ volk_32fc_index_max_16u_a_sse3(unsigned int* target, lv_32fc_t* src0,
+
+ #ifdef LV_HAVE_GENERIC
+ static inline void
+- volk_32fc_index_max_16u_generic(unsigned int* target, lv_32fc_t* src0,
+- unsigned int num_points)
++ volk_32fc_index_max_16u_generic(uint16_t* target, lv_32fc_t* src0,
++ uint32_t num_points)
+ {
+- const unsigned int num_bytes = num_points*8;
++ num_points = (num_points > USHRT_MAX) ? USHRT_MAX : num_points;
++
++ const uint32_t num_bytes = num_points*8;
+
+ float sq_dist = 0.0;
+ float max = 0.0;
+- unsigned int index = 0;
++ uint16_t index = 0;
+
+- unsigned int i = 0;
++ uint32_t i = 0;
+
+ for(; i < num_bytes >> 3; ++i) {
+ sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
+--- /dev/null
++++ kernels/volk/volk_32fc_index_max_32u.h
+@@ -0,0 +1,253 @@
++/* -*- c++ -*- */
++/*
++ * Copyright 2016 Free Software Foundation, Inc.
++ *
++ * This file is part of GNU Radio
++ *
++ * GNU Radio is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 3, or (at your option)
++ * any later version.
++ *
++ * GNU Radio is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNU Radio; see the file COPYING. If not, write to
++ * the Free Software Foundation, Inc., 51 Franklin Street,
++ * Boston, MA 02110-1301, USA.
++ */
++
++/*!
++ * \page volk_32fc_index_max_32u
++ *
++ * \b Overview
++ *
++ * Returns Argmax_i mag(x[i]). Finds and returns the index which contains the
++ * maximum magnitude for complex points in the given vector.
++ *
++ * <b>Dispatcher Prototype</b>
++ * \code
++ * void volk_32fc_index_max_32u(uint32_t* target, lv_32fc_t* src0, uint32_t num_points)
++ * \endcode
++ *
++ * \b Inputs
++ * \li src0: The complex input vector.
++ * \li num_points: The number of samples.
++ *
++ * \b Outputs
++ * \li target: The index of the point with maximum magnitude.
++ *
++ * \b Example
++ * Calculate the index of the maximum value of \f$x^2 + x\f$ for points around
++ * the unit circle.
++ * \code
++ * int N = 10;
++ * uint32_t alignment = volk_get_alignment();
++ * lv_32fc_t* in = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
++ * uint32_t* max = (uint32_t*)volk_malloc(sizeof(uint32_t), alignment);
++ *
++ * for(uint32_t ii = 0; ii < N/2; ++ii){
++ * float real = 2.f * ((float)ii / (float)N) - 1.f;
++ * float imag = std::sqrt(1.f - real * real);
++ * in[ii] = lv_cmake(real, imag);
++ * in[ii] = in[ii] * in[ii] + in[ii];
++ * in[N-ii] = lv_cmake(real, imag);
++ * in[N-ii] = in[N-ii] * in[N-ii] + in[N-ii];
++ * }
++ *
++ * volk_32fc_index_max_32u(max, in, N);
++ *
++ * printf("index of max value = %u\n", *max);
++ *
++ * volk_free(in);
++ * volk_free(max);
++ * \endcode
++ */
++
++#ifndef INCLUDED_volk_32fc_index_max_32u_a_H
++#define INCLUDED_volk_32fc_index_max_32u_a_H
++
++#include <volk/volk_common.h>
++#include<inttypes.h>
++#include<stdio.h>
++#include<volk/volk_complex.h>
++
++#ifdef LV_HAVE_SSE3
++#include<xmmintrin.h>
++#include<pmmintrin.h>
++
++static inline void
++volk_32fc_index_max_32u_a_sse3(uint32_t* target, lv_32fc_t* src0,
++ uint32_t num_points)
++{
++ const uint32_t num_bytes = num_points*8;
++
++ union bit128 holderf;
++ union bit128 holderi;
++ float sq_dist = 0.0;
++
++ union bit128 xmm5, xmm4;
++ __m128 xmm1, xmm2, xmm3;
++ __m128i xmm8, xmm11, xmm12, xmmfive, xmmfour, xmm9, holder0, holder1, xmm10;
++
++ xmm5.int_vec = xmmfive = _mm_setzero_si128();
++ xmm4.int_vec = xmmfour = _mm_setzero_si128();
++ holderf.int_vec = holder0 = _mm_setzero_si128();
++ holderi.int_vec = holder1 = _mm_setzero_si128();
++
++ int bound = num_bytes >> 5;
++ int leftovers0 = (num_bytes >> 4) & 1;
++ int leftovers1 = (num_bytes >> 3) & 1;
++ int i = 0;
++
++ xmm8 = _mm_set_epi32(3, 2, 1, 0);//remember the crazy reverse order!
++ xmm9 = xmm8 = _mm_setzero_si128();
++ xmm10 = _mm_set_epi32(4, 4, 4, 4);
++ xmm3 = _mm_setzero_ps();
++
++ //printf("%f, %f, %f, %f\n", ((float*)&xmm10)[0], ((float*)&xmm10)[1], ((float*)&xmm10)[2], ((float*)&xmm10)[3]);
++
++ for(; i < bound; ++i) {
++ xmm1 = _mm_load_ps((float*)src0);
++ xmm2 = _mm_load_ps((float*)&src0[2]);
++
++ src0 += 4;
++
++ xmm1 = _mm_mul_ps(xmm1, xmm1);
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
++
++ xmm1 = _mm_hadd_ps(xmm1, xmm2);
++
++ xmm3 = _mm_max_ps(xmm1, xmm3);
++
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
++
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
++
++ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm10)[0], ((uint32_t*)&xmm10)[1], ((uint32_t*)&xmm10)[2], ((uint32_t*)&xmm10)[3]);
++ }
++
++
++ for(i = 0; i < leftovers0; ++i) {
++ xmm2 = _mm_load_ps((float*)src0);
++
++ xmm1 = _mm_movelh_ps(bit128_p(&xmm8)->float_vec, bit128_p(&xmm8)->float_vec);
++ xmm8 = bit128_p(&xmm1)->int_vec;
++
++ xmm2 = _mm_mul_ps(xmm2, xmm2);
++
++ src0 += 2;
++
++ xmm1 = _mm_hadd_ps(xmm2, xmm2);
++
++ xmm3 = _mm_max_ps(xmm1, xmm3);
++
++ xmm10 = _mm_set_epi32(2, 2, 2, 2);//load1_ps((float*)&init[2]);
++
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++
++ xmm11 = _mm_and_si128(xmm8, xmm5.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm4.int_vec);
++
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
++
++ xmm8 = _mm_add_epi32(xmm8, xmm10);
++ //printf("egads%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++ }
++
++ for(i = 0; i < leftovers1; ++i) {
++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++
++ sq_dist = lv_creal(src0[0]) * lv_creal(src0[0]) + lv_cimag(src0[0]) * lv_cimag(src0[0]);
++
++ xmm2 = _mm_load1_ps(&sq_dist);
++
++ xmm1 = xmm3;
++
++ xmm3 = _mm_max_ss(xmm3, xmm2);
++
++ xmm4.float_vec = _mm_cmplt_ps(xmm1, xmm3);
++ xmm5.float_vec = _mm_cmpeq_ps(xmm1, xmm3);
++
++ xmm8 = _mm_shuffle_epi32(xmm8, 0x00);
++
++ xmm11 = _mm_and_si128(xmm8, xmm4.int_vec);
++ xmm12 = _mm_and_si128(xmm9, xmm5.int_vec);
++
++ xmm9 = _mm_add_epi32(xmm11, xmm12);
++ }
++
++ //printf("%f, %f, %f, %f\n", ((float*)&xmm3)[0], ((float*)&xmm3)[1], ((float*)&xmm3)[2], ((float*)&xmm3)[3]);
++ //printf("%u, %u, %u, %u\n", ((uint32_t*)&xmm9)[0], ((uint32_t*)&xmm9)[1], ((uint32_t*)&xmm9)[2], ((uint32_t*)&xmm9)[3]);
++
++ _mm_store_ps((float*)&(holderf.f), xmm3);
++ _mm_store_si128(&(holderi.int_vec), xmm9);
++
++ target[0] = holderi.i[0];
++ sq_dist = holderf.f[0];
++ target[0] = (holderf.f[1] > sq_dist) ? holderi.i[1] : target[0];
++ sq_dist = (holderf.f[1] > sq_dist) ? holderf.f[1] : sq_dist;
++ target[0] = (holderf.f[2] > sq_dist) ? holderi.i[2] : target[0];
++ sq_dist = (holderf.f[2] > sq_dist) ? holderf.f[2] : sq_dist;
++ target[0] = (holderf.f[3] > sq_dist) ? holderi.i[3] : target[0];
++ sq_dist = (holderf.f[3] > sq_dist) ? holderf.f[3] : sq_dist;
++
++ /*
++ float placeholder = 0.0;
++ uint32_t temp0, temp1;
++ uint32_t g0 = (((float*)&xmm3)[0] > ((float*)&xmm3)[1]);
++ uint32_t l0 = g0 ^ 1;
++
++ uint32_t g1 = (((float*)&xmm3)[1] > ((float*)&xmm3)[2]);
++ uint32_t l1 = g1 ^ 1;
++
++ temp0 = g0 * ((uint32_t*)&xmm9)[0] + l0 * ((uint32_t*)&xmm9)[1];
++ temp1 = g0 * ((uint32_t*)&xmm9)[2] + l0 * ((uint32_t*)&xmm9)[3];
++ sq_dist = g0 * ((float*)&xmm3)[0] + l0 * ((float*)&xmm3)[1];
++ placeholder = g0 * ((float*)&xmm3)[2] + l0 * ((float*)&xmm3)[3];
++
++ g0 = (sq_dist > placeholder);
++ l0 = g0 ^ 1;
++ target[0] = g0 * temp0 + l0 * temp1;
++ */
++}
++
++#endif /*LV_HAVE_SSE3*/
++
++#ifdef LV_HAVE_GENERIC
++static inline void
++ volk_32fc_index_max_32u_generic(uint32_t* target, lv_32fc_t* src0,
++ uint32_t num_points)
++{
++ const uint32_t num_bytes = num_points*8;
++
++ float sq_dist = 0.0;
++ float max = 0.0;
++ uint32_t index = 0;
++
++ uint32_t i = 0;
++
++ for(; i < num_bytes >> 3; ++i) {
++ sq_dist = lv_creal(src0[i]) * lv_creal(src0[i]) + lv_cimag(src0[i]) * lv_cimag(src0[i]);
++
++ index = sq_dist > max ? i : index;
++ max = sq_dist > max ? sq_dist : max;
++ }
++ target[0] = index;
++}
++
++#endif /*LV_HAVE_GENERIC*/
++
++
++#endif /*INCLUDED_volk_32fc_index_max_32u_a_H*/
+--- /dev/null
++++ kernels/volk/volk_32fc_x2_divide_32fc.h
+@@ -0,0 +1,226 @@
++/* -*- c++ -*- */
++/*
++ * Copyright 2016 Free Software Foundation, Inc.
++ *
++ * This file is part of GNU Radio
++ *
++ * GNU Radio is free software; you can redistribute it and/or modify
++ * it under the terms of the GNU General Public License as published by
++ * the Free Software Foundation; either version 3, or (at your option)
++ * any later version.
++ *
++ * GNU Radio is distributed in the hope that it will be useful,
++ * but WITHOUT ANY WARRANTY; without even the implied warranty of
++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ * GNU General Public License for more details.
++ *
++ * You should have received a copy of the GNU General Public License
++ * along with GNU Radio; see the file COPYING. If not, write to
++ * the Free Software Foundation, Inc., 51 Franklin Street,
++ * Boston, MA 02110-1301, USA.
++ */
++
++/*!
++ * \page volk_32fc_x2_divide_32fc
++ *
++ * \b Overview
++ *
++ * Divides the first complex vector element-wise by the second complex vector.
++ *
++ * <b>Dispatcher Prototype</b>
++ * \code
++ * void volk_32fc_x2_divide_32fc(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector, const lv_32fc_t* denumeratorVector, unsigned int num_points);
++ * \endcode
++ *
++ * \b Inputs
++ * \li numeratorVector: The numerator complex values.
++ * \li denumeratorVector: The denominator complex values.
++ * \li num_points: The number of data points.
++ *
++ * \b Outputs
++ * \li cVector: The output vector of complex floats.
++ *
++ * \b Example
++ * Divide a complex vector by itself; every output element should be very close to 1+0j.
++ *
++ * \code
++ * int N = 10;
++ * unsigned int alignment = volk_get_alignment();
++ * lv_32fc_t* input_vector = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
++ * lv_32fc_t* out = (lv_32fc_t*)volk_malloc(sizeof(lv_32fc_t)*N, alignment);
++ *
++ * float delta = 2.f*M_PI / (float)N;
++ * for(unsigned int ii = 0; ii < N; ++ii){
++ * float real_1 = std::cos(0.3f * (float)ii);
++ * float imag_1 = std::sin(0.3f * (float)ii);
++ * input_vector[ii] = lv_cmake(real_1, imag_1);
++ * }
++ *
++ * volk_32fc_x2_divide_32fc(out, input_vector, input_vector, N);
++ *
++ * for(unsigned int ii = 0; ii < N; ++ii){
++ * printf("%1.4f%+1.4fj,", lv_creal(out[ii]), lv_cimag(out[ii]));
++ * }
++ * printf("\n");
++ *
++ * volk_free(input_vector);
++ * volk_free(out);
++ * \endcode
++ */
++
++#ifndef INCLUDED_volk_32fc_x2_divide_32fc_u_H
++#define INCLUDED_volk_32fc_x2_divide_32fc_u_H
++
++#include <inttypes.h>
++#include <volk/volk_complex.h>
++#include <float.h>
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++#include <volk/volk_avx_intrinsics.h>
++
++static inline void
++volk_32fc_x2_divide_32fc_u_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector, unsigned int num_points)
++{
++ /*
++ * we'll do the "classical"
++ *  a      a b*
++ * --- = -------
++ *  b     |b|^2
++ * */
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = numeratorVector;
++ const lv_32fc_t* b = denumeratorVector;
++
++ for(; number < quarterPoints; number++){
++ num = _mm256_loadu_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ denum = _mm256_loadu_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
++ sq = _mm256_mul_ps(denum, denum); // Square the values
++ mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder within each 128-bit lane so every |b|^2 lines up with both the real and imaginary slot of its element
++ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
++ div = _mm256_div_ps(mul_conj,mag_sq);
++
++ _mm256_storeu_ps((float*) c, div); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for(; number < num_points; number++){
++ *c++ = (*a++) / (*b++);
++ }
++
++}
++#endif /* LV_HAVE_AVX */
++
++
++#ifdef LV_HAVE_GENERIC
++
++static inline void
++volk_32fc_x2_divide_32fc_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector, unsigned int num_points)
++{
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++
++#endif /* INCLUDED_volk_32fc_x2_divide_32fc_u_H */
++
++
++#ifndef INCLUDED_volk_32fc_x2_divide_32fc_a_H
++#define INCLUDED_volk_32fc_x2_divide_32fc_a_H
++
++#include <inttypes.h>
++#include <stdio.h>
++#include <volk/volk_complex.h>
++#include <float.h>
++
++
++#ifdef LV_HAVE_AVX
++#include <immintrin.h>
++#include <volk/volk_avx_intrinsics.h>
++
++static inline void
++volk_32fc_x2_divide_32fc_a_avx(lv_32fc_t* cVector, const lv_32fc_t* numeratorVector,
++ const lv_32fc_t* denumeratorVector, unsigned int num_points)
++{
++ /*
++ * we'll do the "classical"
++ *  a      a b*
++ * --- = -------
++ *  b     |b|^2
++ * */
++ unsigned int number = 0;
++ const unsigned int quarterPoints = num_points / 4;
++
++ __m256 num, denum, mul_conj, sq, mag_sq, mag_sq_un, div;
++ lv_32fc_t* c = cVector;
++ const lv_32fc_t* a = numeratorVector;
++ const lv_32fc_t* b = denumeratorVector;
++
++ for(; number < quarterPoints; number++){
++ num = _mm256_load_ps((float*) a); // Load the ar + ai, br + bi ... as ar,ai,br,bi ...
++ denum = _mm256_load_ps((float*) b); // Load the cr + ci, dr + di ... as cr,ci,dr,di ...
++ mul_conj = _mm256_complexconjugatemul_ps(num, denum);
++ sq = _mm256_mul_ps(denum, denum); // Square the values
++ mag_sq_un = _mm256_hadd_ps(sq,sq); // obtain the actual squared magnitude, although out of order
++ mag_sq = _mm256_permute_ps(mag_sq_un, 0xd8); // reorder within each 128-bit lane so every |b|^2 lines up with both the real and imaginary slot of its element
++ // best guide I found on using these functions: https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=2738,2059,2738,2738,3875,3874,3875,2738,3870
++ div = _mm256_div_ps(mul_conj,mag_sq);
++
++ _mm256_store_ps((float*) c, div); // Store the results back into the C container
++
++ a += 4;
++ b += 4;
++ c += 4;
++ }
++
++ number = quarterPoints * 4;
++
++ for(; number < num_points; number++){
++ *c++ = (*a++) / (*b++);
++ }
++
++
++}
++#endif /* LV_HAVE_AVX */
++
++
++#ifdef LV_HAVE_GENERIC
++
++static inline void
++volk_32fc_x2_divide_32fc_a_generic(lv_32fc_t* cVector, const lv_32fc_t* aVector,
++ const lv_32fc_t* bVector, unsigned int num_points)
++{
++ lv_32fc_t* cPtr = cVector;
++ const lv_32fc_t* aPtr = aVector;
++ const lv_32fc_t* bPtr= bVector;
++ unsigned int number = 0;
++
++ for(number = 0; number < num_points; number++){
++ *cPtr++ = (*aPtr++) / (*bPtr++);
++ }
++}
++#endif /* LV_HAVE_GENERIC */
++
++
++#endif /* INCLUDED_volk_32fc_x2_divide_32fc_a_H */
+--- lib/CMakeLists.txt.orig
++++ lib/CMakeLists.txt
+@@ -383,7 +383,7 @@ foreach(machine_name ${available_machines})
+ )
+ MESSAGE(STATUS "BUILD INFO ::: ${machine_name} ::: ${COMPILER_NAME} ::: ${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}")
+ set(COMPILER_INFO "${COMPILER_INFO}${machine_name}:::${COMPILER_NAME}:::${CMAKE_C_FLAGS_${CBTU}} ${CMAKE_C_FLAGS} ${${machine_name}_flags}\n" )
+- if(${machine_name}_flags)
++ if(${machine_name}_flags AND NOT MSVC)
+ set_source_files_properties(${machine_source} PROPERTIES COMPILE_FLAGS "${${machine_name}_flags}")
+ endif()
+
+--- lib/kernel_tests.h.orig
++++ lib/kernel_tests.h
+@@ -50,6 +50,7 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+ (VOLK_INIT_TEST(volk_32f_accumulator_s32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32f_x2_add_32f, test_params))
+ (VOLK_INIT_TEST(volk_32f_index_max_16u, test_params))
++ (VOLK_INIT_TEST(volk_32f_index_max_32u, test_params))
+ (VOLK_INIT_TEST(volk_32fc_32f_multiply_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_log2_32f, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32f_expfast_32f, volk_test_params_t(1e-1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+@@ -73,11 +74,13 @@ std::vector<volk_test_case_t> init_test_list(volk_test_params_t test_params)
+ (VOLK_INIT_TEST(volk_32fc_x2_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_32f_dot_prod_32fc, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_index_max_16u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
++ (VOLK_INIT_TEST(volk_32fc_index_max_32u, volk_test_params_t(3, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+ (VOLK_INIT_TEST(volk_32fc_s32f_magnitude_16i, test_params_int1))
+ (VOLK_INIT_TEST(volk_32fc_magnitude_32f, test_params_inacc))
+ (VOLK_INIT_TEST(volk_32fc_magnitude_squared_32f, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_multiply_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32fc_x2_multiply_conjugate_32fc, test_params))
++ (VOLK_INIT_TEST(volk_32fc_x2_divide_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32fc_conjugate_32fc, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_convert_16i, test_params))
+ (VOLK_INIT_TEST(volk_32f_s32f_convert_32i, volk_test_params_t(1, test_params.scalar(), test_params.vlen(), test_params.iter(), test_params.benchmark_mode(), test_params.kernel_regex())))
+--- lib/volk_rank_archs.c.orig
++++ lib/volk_rank_archs.c
+@@ -38,7 +38,7 @@ int volk_get_index(
+ }
+ //TODO return -1;
+ //something terrible should happen here
+- printf("Volk warning: no arch found, returning generic impl\n");
++ fprintf(stderr, "Volk warning: no arch found, returning generic impl\n");
+ return volk_get_index(impl_names, n_impls, "generic"); //but we'll fake it for now
+ }
+
+--- tmpl/volk.tmpl.c.orig
++++ tmpl/volk.tmpl.c
+@@ -53,7 +53,7 @@ struct volk_machine *get_machine(void)
+ }
+ }
+ machine = max_machine;
+- printf("Using Volk machine: %s\n", machine->name);
++ //printf("Using Volk machine: %s\n", machine->name);
+ __alignment = machine->alignment;
+ __alignment_mask = (intptr_t)(__alignment-1);
+ return machine;
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.macosforge.org/pipermail/macports-changes/attachments/20160531/58001f08/attachment-0001.html>
More information about the macports-changes
mailing list